1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass tries to fuse DS instructions with close by immediate offsets. 10 // This will fuse operations such as 11 // ds_read_b32 v0, v2 offset:16 12 // ds_read_b32 v1, v2 offset:32 13 // ==> 14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 15 // 16 // The same is done for certain SMEM and VMEM opcodes, e.g.: 17 // s_buffer_load_dword s4, s[0:3], 4 18 // s_buffer_load_dword s5, s[0:3], 8 19 // ==> 20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 21 // 22 // This pass also tries to promote constant offset to the immediate by 23 // adjusting the base. It tries to use a base from the nearby instructions that 24 // allows it to have a 13bit constant offset and then promotes the 13bit offset 25 // to the immediate. 26 // E.g. 27 // s_movk_i32 s0, 0x1800 28 // v_add_co_u32_e32 v0, vcc, s0, v2 29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc 30 // 31 // s_movk_i32 s0, 0x1000 32 // v_add_co_u32_e32 v5, vcc, s0, v2 33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 34 // global_load_dwordx2 v[5:6], v[5:6], off 35 // global_load_dwordx2 v[0:1], v[0:1], off 36 // => 37 // s_movk_i32 s0, 0x1000 38 // v_add_co_u32_e32 v5, vcc, s0, v2 39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 40 // global_load_dwordx2 v[5:6], v[5:6], off 41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 42 // 43 // Future improvements: 44 // 45 // - This is currently missing stores of constants because loading 46 // the constant into the data register is placed between the stores, although 47 // this is arguably a scheduling problem. 48 // 49 // - Live interval recomputing seems inefficient. This currently only matches 50 // one pair, and recomputes live intervals and moves on to the next pair. It 51 // would be better to compute a list of all merges that need to occur. 52 // 53 // - With a list of instructions to process, we can also merge more. If a 54 // cluster of loads have offsets that are too large to fit in the 8-bit 55 // offsets, but are close enough to fit in the 8 bits, we can add to the base 56 // pointer and use the new reduced offsets. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #include "AMDGPU.h" 61 #include "GCNSubtarget.h" 62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 63 #include "llvm/Analysis/AliasAnalysis.h" 64 #include "llvm/CodeGen/MachineFunctionPass.h" 65 #include "llvm/InitializePasses.h" 66 67 using namespace llvm; 68 69 #define DEBUG_TYPE "si-load-store-opt" 70 71 namespace { 72 enum InstClassEnum { 73 UNKNOWN, 74 DS_READ, 75 DS_WRITE, 76 S_BUFFER_LOAD_IMM, 77 BUFFER_LOAD, 78 BUFFER_STORE, 79 MIMG, 80 TBUFFER_LOAD, 81 TBUFFER_STORE, 82 }; 83 84 struct AddressRegs { 85 unsigned char NumVAddrs = 0; 86 bool SBase = false; 87 bool SRsrc = false; 88 bool SOffset = false; 89 bool VAddr = false; 90 bool Addr = false; 91 bool SSamp = false; 92 }; 93 94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 
95 const unsigned MaxAddressRegs = 12 + 1 + 1; 96 97 class SILoadStoreOptimizer : public MachineFunctionPass { 98 struct CombineInfo { 99 MachineBasicBlock::iterator I; 100 unsigned EltSize; 101 unsigned Offset; 102 unsigned Width; 103 unsigned Format; 104 unsigned BaseOff; 105 unsigned DMask; 106 InstClassEnum InstClass; 107 unsigned CPol = 0; 108 bool UseST64; 109 int AddrIdx[MaxAddressRegs]; 110 const MachineOperand *AddrReg[MaxAddressRegs]; 111 unsigned NumAddresses; 112 unsigned Order; 113 114 bool hasSameBaseAddress(const MachineInstr &MI) { 115 for (unsigned i = 0; i < NumAddresses; i++) { 116 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 117 118 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 119 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 120 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 121 return false; 122 } 123 continue; 124 } 125 126 // Check same base pointer. Be careful of subregisters, which can occur 127 // with vectors of pointers. 128 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 129 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 130 return false; 131 } 132 } 133 return true; 134 } 135 136 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 137 for (unsigned i = 0; i < NumAddresses; ++i) { 138 const MachineOperand *AddrOp = AddrReg[i]; 139 // Immediates are always OK. 140 if (AddrOp->isImm()) 141 continue; 142 143 // Don't try to merge addresses that aren't either immediates or registers. 144 // TODO: Should be possible to merge FrameIndexes and maybe some other 145 // non-register operands. 146 if (!AddrOp->isReg()) 147 return false; 148 149 // TODO: We should be able to merge physical reg addresses. 150 if (AddrOp->getReg().isPhysical()) 151 return false; 152 153 // If an address has only one use then there will be no other 154 // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 156 return false; 157 } 158 return true; 159 } 160 161 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 162 const GCNSubtarget &STM); 163 }; 164 165 struct BaseRegisters { 166 Register LoReg; 167 Register HiReg; 168 169 unsigned LoSubReg = 0; 170 unsigned HiSubReg = 0; 171 }; 172 173 struct MemAddress { 174 BaseRegisters Base; 175 int64_t Offset = 0; 176 }; 177 178 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 179 180 private: 181 const GCNSubtarget *STM = nullptr; 182 const SIInstrInfo *TII = nullptr; 183 const SIRegisterInfo *TRI = nullptr; 184 MachineRegisterInfo *MRI = nullptr; 185 AliasAnalysis *AA = nullptr; 186 bool OptimizeAgain; 187 188 static bool dmasksCanBeCombined(const CombineInfo &CI, 189 const SIInstrInfo &TII, 190 const CombineInfo &Paired); 191 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 192 CombineInfo &Paired, bool Modify = false); 193 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 194 const CombineInfo &Paired); 195 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 196 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 197 const CombineInfo &Paired); 198 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 199 const CombineInfo &Paired); 200 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 201 202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 203 SmallVectorImpl<MachineInstr *> &InstsToMove); 204 205 unsigned read2Opcode(unsigned EltSize) const; 206 unsigned read2ST64Opcode(unsigned EltSize) const; 207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 208 CombineInfo &Paired, 209 const SmallVectorImpl<MachineInstr *> &InstsToMove); 210 211 unsigned write2Opcode(unsigned EltSize) const; 212 unsigned write2ST64Opcode(unsigned EltSize) const; 213 MachineBasicBlock::iterator 214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 215 const SmallVectorImpl<MachineInstr *> &InstsToMove); 216 MachineBasicBlock::iterator 217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 218 const SmallVectorImpl<MachineInstr *> &InstsToMove); 219 MachineBasicBlock::iterator 220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 221 const SmallVectorImpl<MachineInstr *> &InstsToMove); 222 MachineBasicBlock::iterator 223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 224 const SmallVectorImpl<MachineInstr *> &InstsToMove); 225 MachineBasicBlock::iterator 226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 227 const SmallVectorImpl<MachineInstr *> &InstsToMove); 228 MachineBasicBlock::iterator 229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 230 const SmallVectorImpl<MachineInstr *> &InstsToMove); 231 MachineBasicBlock::iterator 232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 233 const SmallVectorImpl<MachineInstr *> &InstsToMove); 234 235 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 236 int32_t NewOffset) const; 237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 241 /// Promotes constant offset to the immediate by adjusting the base. 
It 242 /// tries to use a base from the nearby instructions that allows it to have 243 /// a 13bit constant offset which gets promoted to the immediate. 244 bool promoteConstantOffsetToImm(MachineInstr &CI, 245 MemInfoMap &Visited, 246 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 247 void addInstToMergeableList(const CombineInfo &CI, 248 std::list<std::list<CombineInfo> > &MergeableInsts) const; 249 250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 253 std::list<std::list<CombineInfo>> &MergeableInsts) const; 254 255 public: 256 static char ID; 257 258 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 260 } 261 262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 263 bool &OptimizeListAgain); 264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 265 266 bool runOnMachineFunction(MachineFunction &MF) override; 267 268 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 269 270 void getAnalysisUsage(AnalysisUsage &AU) const override { 271 AU.setPreservesCFG(); 272 AU.addRequired<AAResultsWrapperPass>(); 273 274 MachineFunctionPass::getAnalysisUsage(AU); 275 } 276 277 MachineFunctionProperties getRequiredProperties() const override { 278 return MachineFunctionProperties() 279 .set(MachineFunctionProperties::Property::IsSSA); 280 } 281 }; 282 283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 284 const unsigned Opc = MI.getOpcode(); 285 286 if (TII.isMUBUF(Opc)) { 287 // FIXME: Handle d16 correctly 288 return AMDGPU::getMUBUFElements(Opc); 289 } 290 if (TII.isMIMG(MI)) { 291 uint64_t DMaskImm = 292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 293 return countPopulation(DMaskImm); 294 } 295 if (TII.isMTBUF(Opc)) { 296 return AMDGPU::getMTBUFElements(Opc); 297 } 298 299 switch (Opc) { 300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 301 return 1; 302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 303 return 2; 304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 305 return 4; 306 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 307 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 308 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 309 case AMDGPU::DS_WRITE_B32_gfx9: 310 return 1; 311 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 312 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 313 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 314 case AMDGPU::DS_WRITE_B64_gfx9: 315 return 2; 316 default: 317 return 0; 318 } 319 } 320 321 /// Maps instruction opcode to enum InstClassEnum. 322 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 323 switch (Opc) { 324 default: 325 if (TII.isMUBUF(Opc)) { 326 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 327 default: 328 return UNKNOWN; 329 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 330 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 331 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 332 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 333 return BUFFER_LOAD; 334 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 335 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 337 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 338 return BUFFER_STORE; 339 } 340 } 341 if (TII.isMIMG(Opc)) { 342 // Ignore instructions encoded without vaddr. 
343 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 344 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 345 return UNKNOWN; 346 // Ignore BVH instructions 347 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 348 return UNKNOWN; 349 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 350 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 351 TII.isGather4(Opc)) 352 return UNKNOWN; 353 return MIMG; 354 } 355 if (TII.isMTBUF(Opc)) { 356 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 357 default: 358 return UNKNOWN; 359 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 360 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 363 return TBUFFER_LOAD; 364 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 365 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 368 return TBUFFER_STORE; 369 } 370 } 371 return UNKNOWN; 372 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 374 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 375 return S_BUFFER_LOAD_IMM; 376 case AMDGPU::DS_READ_B32: 377 case AMDGPU::DS_READ_B32_gfx9: 378 case AMDGPU::DS_READ_B64: 379 case AMDGPU::DS_READ_B64_gfx9: 380 return DS_READ; 381 case AMDGPU::DS_WRITE_B32: 382 case AMDGPU::DS_WRITE_B32_gfx9: 383 case AMDGPU::DS_WRITE_B64: 384 case AMDGPU::DS_WRITE_B64_gfx9: 385 return DS_WRITE; 386 } 387 } 388 389 /// Determines instruction subclass from opcode. Only instructions 390 /// of the same subclass can be merged together. 391 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 392 switch (Opc) { 393 default: 394 if (TII.isMUBUF(Opc)) 395 return AMDGPU::getMUBUFBaseOpcode(Opc); 396 if (TII.isMIMG(Opc)) { 397 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 398 assert(Info); 399 return Info->BaseOpcode; 400 } 401 if (TII.isMTBUF(Opc)) 402 return AMDGPU::getMTBUFBaseOpcode(Opc); 403 return -1; 404 case AMDGPU::DS_READ_B32: 405 case AMDGPU::DS_READ_B32_gfx9: 406 case AMDGPU::DS_READ_B64: 407 case AMDGPU::DS_READ_B64_gfx9: 408 case AMDGPU::DS_WRITE_B32: 409 case AMDGPU::DS_WRITE_B32_gfx9: 410 case AMDGPU::DS_WRITE_B64: 411 case AMDGPU::DS_WRITE_B64_gfx9: 412 return Opc; 413 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 414 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 415 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 416 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 417 } 418 } 419 420 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 421 AddressRegs Result; 422 423 if (TII.isMUBUF(Opc)) { 424 if (AMDGPU::getMUBUFHasVAddr(Opc)) 425 Result.VAddr = true; 426 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 427 Result.SRsrc = true; 428 if (AMDGPU::getMUBUFHasSoffset(Opc)) 429 Result.SOffset = true; 430 431 return Result; 432 } 433 434 if (TII.isMIMG(Opc)) { 435 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 436 if (VAddr0Idx >= 0) { 437 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 438 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 439 } else { 440 Result.VAddr = true; 441 } 442 Result.SRsrc = true; 443 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 444 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 445 Result.SSamp = true; 446 447 return Result; 448 } 449 if (TII.isMTBUF(Opc)) { 450 if (AMDGPU::getMTBUFHasVAddr(Opc)) 451 Result.VAddr = true; 452 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 453 Result.SRsrc = true; 454 if 
(AMDGPU::getMTBUFHasSoffset(Opc)) 455 Result.SOffset = true; 456 457 return Result; 458 } 459 460 switch (Opc) { 461 default: 462 return Result; 463 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 464 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 465 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 466 Result.SBase = true; 467 return Result; 468 case AMDGPU::DS_READ_B32: 469 case AMDGPU::DS_READ_B64: 470 case AMDGPU::DS_READ_B32_gfx9: 471 case AMDGPU::DS_READ_B64_gfx9: 472 case AMDGPU::DS_WRITE_B32: 473 case AMDGPU::DS_WRITE_B64: 474 case AMDGPU::DS_WRITE_B32_gfx9: 475 case AMDGPU::DS_WRITE_B64_gfx9: 476 Result.Addr = true; 477 return Result; 478 } 479 } 480 481 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 482 const SIInstrInfo &TII, 483 const GCNSubtarget &STM) { 484 I = MI; 485 unsigned Opc = MI->getOpcode(); 486 InstClass = getInstClass(Opc, TII); 487 488 if (InstClass == UNKNOWN) 489 return; 490 491 switch (InstClass) { 492 case DS_READ: 493 EltSize = 494 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 495 : 4; 496 break; 497 case DS_WRITE: 498 EltSize = 499 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 500 : 4; 501 break; 502 case S_BUFFER_LOAD_IMM: 503 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 504 break; 505 default: 506 EltSize = 4; 507 break; 508 } 509 510 if (InstClass == MIMG) { 511 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 512 // Offset is not considered for MIMG instructions. 513 Offset = 0; 514 } else { 515 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 516 Offset = I->getOperand(OffsetIdx).getImm(); 517 } 518 519 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 520 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 521 522 Width = getOpcodeWidth(*I, TII); 523 524 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 525 Offset &= 0xffff; 526 } else if (InstClass != MIMG) { 527 CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 528 } 529 530 AddressRegs Regs = getRegs(Opc, TII); 531 532 NumAddresses = 0; 533 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 534 AddrIdx[NumAddresses++] = 535 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 536 if (Regs.Addr) 537 AddrIdx[NumAddresses++] = 538 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 539 if (Regs.SBase) 540 AddrIdx[NumAddresses++] = 541 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 542 if (Regs.SRsrc) 543 AddrIdx[NumAddresses++] = 544 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 545 if (Regs.SOffset) 546 AddrIdx[NumAddresses++] = 547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 548 if (Regs.VAddr) 549 AddrIdx[NumAddresses++] = 550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 551 if (Regs.SSamp) 552 AddrIdx[NumAddresses++] = 553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 554 assert(NumAddresses <= MaxAddressRegs); 555 556 for (unsigned J = 0; J < NumAddresses; J++) 557 AddrReg[J] = &I->getOperand(AddrIdx[J]); 558 } 559 560 } // end anonymous namespace. 
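// Illustrative example (not taken from the source): for a 32-bit DS load
// such as DS_READ_B32 with an immediate offset of 16, setMI() above records
// InstClass = DS_READ, EltSize = 4, Offset = 16, Width = 1, and a single
// address operand (the 'addr' register). These are the fields that
// hasSameBaseAddress() and offsetsCanBeCombined() later compare when looking
// for a mergeable partner instruction.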
561 562 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 563 "SI Load Store Optimizer", false, false) 564 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 565 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 566 false, false) 567 568 char SILoadStoreOptimizer::ID = 0; 569 570 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 571 572 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 573 return new SILoadStoreOptimizer(); 574 } 575 576 static void moveInstsAfter(MachineBasicBlock::iterator I, 577 ArrayRef<MachineInstr *> InstsToMove) { 578 MachineBasicBlock *MBB = I->getParent(); 579 ++I; 580 for (MachineInstr *MI : InstsToMove) { 581 MI->removeFromParent(); 582 MBB->insert(I, MI); 583 } 584 } 585 586 static void addDefsUsesToList(const MachineInstr &MI, 587 DenseSet<Register> &RegDefs, 588 DenseSet<Register> &PhysRegUses) { 589 for (const MachineOperand &Op : MI.operands()) { 590 if (Op.isReg()) { 591 if (Op.isDef()) 592 RegDefs.insert(Op.getReg()); 593 else if (Op.readsReg() && Op.getReg().isPhysical()) 594 PhysRegUses.insert(Op.getReg()); 595 } 596 } 597 } 598 599 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, 600 MachineBasicBlock::iterator B, 601 AliasAnalysis *AA) { 602 // RAW or WAR - cannot reorder 603 // WAW - cannot reorder 604 // RAR - safe to reorder 605 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); 606 } 607 608 // Add MI and its defs to the lists if MI reads one of the defs that are 609 // already in the list. Returns true in that case. 610 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, 611 DenseSet<Register> &PhysRegUses, 612 SmallVectorImpl<MachineInstr *> &Insts) { 613 for (MachineOperand &Use : MI.operands()) { 614 // If one of the defs is read, then there is a use of Def between I and the 615 // instruction that I will potentially be merged with. We will need to move 616 // this instruction after the merged instructions. 617 // 618 // Similarly, if there is a def which is read by an instruction that is to 619 // be moved for merging, then we need to move the def-instruction as well. 620 // This can only happen for physical registers such as M0; virtual 621 // registers are in SSA form. 622 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || 623 (Use.isDef() && RegDefs.count(Use.getReg())) || 624 (Use.isDef() && Use.getReg().isPhysical() && 625 PhysRegUses.count(Use.getReg())))) { 626 Insts.push_back(&MI); 627 addDefsUsesToList(MI, RegDefs, PhysRegUses); 628 return true; 629 } 630 } 631 632 return false; 633 } 634 635 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, 636 ArrayRef<MachineInstr *> InstsToMove, 637 AliasAnalysis *AA) { 638 assert(MemOp.mayLoadOrStore()); 639 640 for (MachineInstr *InstToMove : InstsToMove) { 641 if (!InstToMove->mayLoadOrStore()) 642 continue; 643 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) 644 return false; 645 } 646 return true; 647 } 648 649 // This function assumes that \p A and \p B are identical except for 650 // size and offset, and they reference adjacent memory.
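// For example (illustrative): combining a 4-byte memory operand at offset 16
// with a 4-byte memory operand at offset 20 yields one 8-byte memory operand
// at offset 16, since the merged size is the sum of the two sizes and the
// merged offset is the smaller of the two offsets.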
651 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 652 const MachineMemOperand *A, 653 const MachineMemOperand *B) { 654 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 655 unsigned Size = A->getSize() + B->getSize(); 656 // This function adds the offset parameter to the existing offset for A, 657 // so we pass 0 here as the offset and then manually set it to the correct 658 // value after the call. 659 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 660 MMO->setOffset(MinOffset); 661 return MMO; 662 } 663 664 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 665 const SIInstrInfo &TII, 666 const CombineInfo &Paired) { 667 assert(CI.InstClass == MIMG); 668 669 // Ignore instructions with tfe/lwe set. 670 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 671 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 672 673 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 674 return false; 675 676 // Check other optional immediate operands for equality. 677 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 678 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 679 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 680 681 for (auto op : OperandsToMatch) { 682 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 683 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 684 return false; 685 if (Idx != -1 && 686 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 687 return false; 688 } 689 690 // Check DMask for overlaps. 691 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 692 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 693 694 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 695 if ((1u << AllowedBitsForMin) <= MinMask) 696 return false; 697 698 return true; 699 } 700 701 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 702 unsigned ComponentCount, 703 const GCNSubtarget &STI) { 704 if (ComponentCount > 4) 705 return 0; 706 707 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 708 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 709 if (!OldFormatInfo) 710 return 0; 711 712 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 713 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 714 ComponentCount, 715 OldFormatInfo->NumFormat, STI); 716 717 if (!NewFormatInfo) 718 return 0; 719 720 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 721 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 722 723 return NewFormatInfo->Format; 724 } 725 726 // Return the value in the inclusive range [Lo,Hi] that is aligned to the 727 // highest power of two. Note that the result is well defined for all inputs 728 // including corner cases like: 729 // - if Lo == Hi, return that value 730 // - if Lo == 0, return 0 (even though the "- 1" below underflows 731 // - if Lo > Hi, return 0 (as if the range wrapped around) 732 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { 733 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1); 734 } 735 736 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 737 const GCNSubtarget &STI, 738 CombineInfo &Paired, 739 bool Modify) { 740 assert(CI.InstClass != MIMG); 741 742 // XXX - Would the same offset be OK? Is there any reason this would happen or 743 // be useful? 
744 if (CI.Offset == Paired.Offset) 745 return false; 746 747 // This won't be valid if the offset isn't aligned. 748 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 749 return false; 750 751 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 752 753 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 754 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 755 if (!Info0) 756 return false; 757 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 758 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 759 if (!Info1) 760 return false; 761 762 if (Info0->BitsPerComp != Info1->BitsPerComp || 763 Info0->NumFormat != Info1->NumFormat) 764 return false; 765 766 // TODO: Should be possible to support more formats, but if format loads 767 // are not dword-aligned, the merged load might not be valid. 768 if (Info0->BitsPerComp != 32) 769 return false; 770 771 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 772 return false; 773 } 774 775 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 776 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 777 CI.UseST64 = false; 778 CI.BaseOff = 0; 779 780 // Handle all non-DS instructions. 781 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 782 return (EltOffset0 + CI.Width == EltOffset1 || 783 EltOffset1 + Paired.Width == EltOffset0) && 784 CI.CPol == Paired.CPol && 785 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); 786 } 787 788 // If the offset in elements doesn't fit in 8-bits, we might be able to use 789 // the stride 64 versions. 790 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 791 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 792 if (Modify) { 793 CI.Offset = EltOffset0 / 64; 794 Paired.Offset = EltOffset1 / 64; 795 CI.UseST64 = true; 796 } 797 return true; 798 } 799 800 // Check if the new offsets fit in the reduced 8-bit range. 801 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 802 if (Modify) { 803 CI.Offset = EltOffset0; 804 Paired.Offset = EltOffset1; 805 } 806 return true; 807 } 808 809 // Try to shift base address to decrease offsets. 810 uint32_t Min = std::min(EltOffset0, EltOffset1); 811 uint32_t Max = std::max(EltOffset0, EltOffset1); 812 813 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 814 if (((Max - Min) & ~Mask) == 0) { 815 if (Modify) { 816 // From the range of values we could use for BaseOff, choose the one that 817 // is aligned to the highest power of two, to maximise the chance that 818 // the same offset can be reused for other load/store pairs. 819 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 820 // Copy the low bits of the offsets, so that when we adjust them by 821 // subtracting BaseOff they will be multiples of 64. 822 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 823 CI.BaseOff = BaseOff * CI.EltSize; 824 CI.Offset = (EltOffset0 - BaseOff) / 64; 825 Paired.Offset = (EltOffset1 - BaseOff) / 64; 826 CI.UseST64 = true; 827 } 828 return true; 829 } 830 831 if (isUInt<8>(Max - Min)) { 832 if (Modify) { 833 // From the range of values we could use for BaseOff, choose the one that 834 // is aligned to the highest power of two, to maximise the chance that 835 // the same offset can be reused for other load/store pairs. 
836 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 837 CI.BaseOff = BaseOff * CI.EltSize; 838 CI.Offset = EltOffset0 - BaseOff; 839 Paired.Offset = EltOffset1 - BaseOff; 840 } 841 return true; 842 } 843 844 return false; 845 } 846 847 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 848 const CombineInfo &CI, 849 const CombineInfo &Paired) { 850 const unsigned Width = (CI.Width + Paired.Width); 851 switch (CI.InstClass) { 852 default: 853 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 854 case S_BUFFER_LOAD_IMM: 855 switch (Width) { 856 default: 857 return false; 858 case 2: 859 case 4: 860 return true; 861 } 862 } 863 } 864 865 const TargetRegisterClass * 866 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 867 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 868 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 869 } 870 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 871 return TRI->getRegClassForReg(*MRI, Src->getReg()); 872 } 873 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 874 return TRI->getRegClassForReg(*MRI, Src->getReg()); 875 } 876 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 877 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 878 } 879 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 880 return TRI->getRegClassForReg(*MRI, Src->getReg()); 881 } 882 return nullptr; 883 } 884 885 /// This function assumes that CI comes before Paired in a basic block. 886 bool SILoadStoreOptimizer::checkAndPrepareMerge( 887 CombineInfo &CI, CombineInfo &Paired, 888 SmallVectorImpl<MachineInstr *> &InstsToMove) { 889 890 // Check both offsets (or masks for MIMG) can be combined and fit in the 891 // reduced range. 892 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) 893 return false; 894 895 if (CI.InstClass != MIMG && 896 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) 897 return false; 898 899 const unsigned Opc = CI.I->getOpcode(); 900 const InstClassEnum InstClass = getInstClass(Opc, *TII); 901 902 if (InstClass == UNKNOWN) { 903 return false; 904 } 905 const unsigned InstSubclass = getInstSubclass(Opc, *TII); 906 907 // Do not merge VMEM buffer instructions with "swizzled" bit set. 908 int Swizzled = 909 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); 910 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) 911 return false; 912 913 DenseSet<Register> RegDefsToMove; 914 DenseSet<Register> PhysRegUsesToMove; 915 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); 916 917 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); 918 bool IsAGPR = TRI->hasAGPRs(DataRC); 919 920 MachineBasicBlock::iterator E = std::next(Paired.I); 921 MachineBasicBlock::iterator MBBI = std::next(CI.I); 922 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); 923 for (; MBBI != E; ++MBBI) { 924 925 if (MBBI == MBBE) { 926 // CombineInfo::Order is a hint on the instruction ordering within the 927 // basic block. This hint suggests that CI precedes Paired, which is 928 // true most of the time. However, moveInstsAfter() processing a 929 // previous list may have changed this order in a situation when it 930 // moves an instruction which exists in some other merge list. 931 // In this case it must be dependent. 
932 return false; 933 } 934 935 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || 936 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { 937 // This is not a matching instruction, but we can keep looking as 938 // long as one of these conditions is met: 939 // 1. It is safe to move I down past MBBI. 940 // 2. It is safe to move MBBI down past the instruction that I will 941 // be merged into. 942 943 if (MBBI->hasUnmodeledSideEffects()) { 944 // We can't re-order this instruction with respect to other memory 945 // operations, so we fail both conditions mentioned above. 946 return false; 947 } 948 949 if (MBBI->mayLoadOrStore() && 950 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 951 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { 952 // We fail condition #1, but we may still be able to satisfy condition 953 // #2. Add this instruction to the move list and then we will check 954 // if condition #2 holds once we have selected the matching instruction. 955 InstsToMove.push_back(&*MBBI); 956 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); 957 continue; 958 } 959 960 // When we match I with another DS instruction we will be moving I down 961 // to the location of the matched instruction, so any uses of I will need 962 // to be moved down as well. 963 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 964 InstsToMove); 965 continue; 966 } 967 968 // Don't merge volatiles. 969 if (MBBI->hasOrderedMemoryRef()) 970 return false; 971 972 int Swizzled = 973 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz); 974 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm()) 975 return false; 976 977 // Handle a case like 978 // DS_WRITE_B32 addr, v, idx0 979 // w = DS_READ_B32 addr, idx0 980 // DS_WRITE_B32 addr, f(w), idx1 981 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents 982 // merging of the two writes. 983 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 984 InstsToMove)) 985 continue; 986 987 if (&*MBBI == &*Paired.I) { 988 if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR) 989 return false; 990 // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data 991 // operands. However we are reporting that ds_write2 shall have 992 // only VGPR data so that machine copy propagation does not 993 // create an illegal instruction with VGPR and AGPR sources. 994 // Consequently, if we create such an instruction the verifier 995 // will complain. 996 if (IsAGPR && CI.InstClass == DS_WRITE) 997 return false; 998 999 // We need to go through the list of instructions that we plan to 1000 // move and make sure they are all safe to move down past the merged 1001 // instruction. 1002 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 1003 1004 // Call offsetsCanBeCombined with modify = true so that the offsets are 1005 // correct for the new instruction. This should return true, because 1006 // this function should only be called on CombineInfo objects that 1007 // have already been confirmed to be mergeable. 1008 if (CI.InstClass != MIMG) 1009 offsetsCanBeCombined(CI, *STM, Paired, true); 1010 return true; 1011 } 1012 return false; 1013 } 1014 1015 // We've found a load/store that we couldn't merge for some reason. 1016 // We could potentially keep looking, but we'd need to make sure that 1017 // it was safe to move I and also all the instructions in InstsToMove 1018 // down past this instruction.
1019 // check if we can move I across MBBI and if we can move all I's users 1020 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 1021 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 1022 break; 1023 } 1024 return false; 1025 } 1026 1027 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 1028 if (STM->ldsRequiresM0Init()) 1029 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 1030 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 1031 } 1032 1033 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1034 if (STM->ldsRequiresM0Init()) 1035 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1036 1037 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1038 : AMDGPU::DS_READ2ST64_B64_gfx9; 1039 } 1040 1041 MachineBasicBlock::iterator 1042 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1043 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1044 MachineBasicBlock *MBB = CI.I->getParent(); 1045 1046 // Be careful, since the addresses could be subregisters themselves in weird 1047 // cases, like vectors of pointers. 1048 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1049 1050 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1051 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1052 1053 unsigned NewOffset0 = CI.Offset; 1054 unsigned NewOffset1 = Paired.Offset; 1055 unsigned Opc = 1056 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1057 1058 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1059 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1060 1061 if (NewOffset0 > NewOffset1) { 1062 // Canonicalize the merged instruction so the smaller offset comes first. 1063 std::swap(NewOffset0, NewOffset1); 1064 std::swap(SubRegIdx0, SubRegIdx1); 1065 } 1066 1067 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1068 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1069 1070 const MCInstrDesc &Read2Desc = TII->get(Opc); 1071 1072 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1073 Register DestReg = MRI->createVirtualRegister(SuperRC); 1074 1075 DebugLoc DL = CI.I->getDebugLoc(); 1076 1077 Register BaseReg = AddrReg->getReg(); 1078 unsigned BaseSubReg = AddrReg->getSubReg(); 1079 unsigned BaseRegFlags = 0; 1080 if (CI.BaseOff) { 1081 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1082 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1083 .addImm(CI.BaseOff); 1084 1085 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1086 BaseRegFlags = RegState::Kill; 1087 1088 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1089 .addReg(ImmReg) 1090 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1091 .addImm(0); // clamp bit 1092 BaseSubReg = 0; 1093 } 1094 1095 MachineInstrBuilder Read2 = 1096 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1097 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1098 .addImm(NewOffset0) // offset0 1099 .addImm(NewOffset1) // offset1 1100 .addImm(0) // gds 1101 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1102 1103 (void)Read2; 1104 1105 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1106 1107 // Copy to the old destination registers. 1108 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1109 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1110 .addReg(DestReg, 0, SubRegIdx0); 1111 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1112 .add(*Dest1) 1113 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1114 1115 moveInstsAfter(Copy1, InstsToMove); 1116 1117 CI.I->eraseFromParent(); 1118 Paired.I->eraseFromParent(); 1119 1120 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1121 return Read2; 1122 } 1123 1124 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1125 if (STM->ldsRequiresM0Init()) 1126 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1127 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1128 : AMDGPU::DS_WRITE2_B64_gfx9; 1129 } 1130 1131 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1132 if (STM->ldsRequiresM0Init()) 1133 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1134 : AMDGPU::DS_WRITE2ST64_B64; 1135 1136 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1137 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1138 } 1139 1140 MachineBasicBlock::iterator 1141 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1142 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1143 MachineBasicBlock *MBB = CI.I->getParent(); 1144 1145 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1146 // sure we preserve the subregister index and any register flags set on them. 1147 const MachineOperand *AddrReg = 1148 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1149 const MachineOperand *Data0 = 1150 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1151 const MachineOperand *Data1 = 1152 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1153 1154 unsigned NewOffset0 = CI.Offset; 1155 unsigned NewOffset1 = Paired.Offset; 1156 unsigned Opc = 1157 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1158 1159 if (NewOffset0 > NewOffset1) { 1160 // Canonicalize the merged instruction so the smaller offset comes first. 
1161 std::swap(NewOffset0, NewOffset1); 1162 std::swap(Data0, Data1); 1163 } 1164 1165 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1166 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1167 1168 const MCInstrDesc &Write2Desc = TII->get(Opc); 1169 DebugLoc DL = CI.I->getDebugLoc(); 1170 1171 Register BaseReg = AddrReg->getReg(); 1172 unsigned BaseSubReg = AddrReg->getSubReg(); 1173 unsigned BaseRegFlags = 0; 1174 if (CI.BaseOff) { 1175 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1176 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1177 .addImm(CI.BaseOff); 1178 1179 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1180 BaseRegFlags = RegState::Kill; 1181 1182 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1183 .addReg(ImmReg) 1184 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1185 .addImm(0); // clamp bit 1186 BaseSubReg = 0; 1187 } 1188 1189 MachineInstrBuilder Write2 = 1190 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1191 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1192 .add(*Data0) // data0 1193 .add(*Data1) // data1 1194 .addImm(NewOffset0) // offset0 1195 .addImm(NewOffset1) // offset1 1196 .addImm(0) // gds 1197 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1198 1199 moveInstsAfter(Write2, InstsToMove); 1200 1201 CI.I->eraseFromParent(); 1202 Paired.I->eraseFromParent(); 1203 1204 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1205 return Write2; 1206 } 1207 1208 MachineBasicBlock::iterator 1209 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1210 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1211 MachineBasicBlock *MBB = CI.I->getParent(); 1212 DebugLoc DL = CI.I->getDebugLoc(); 1213 const unsigned Opcode = getNewOpcode(CI, Paired); 1214 1215 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1216 1217 Register DestReg = MRI->createVirtualRegister(SuperRC); 1218 unsigned MergedDMask = CI.DMask | Paired.DMask; 1219 unsigned DMaskIdx = 1220 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1221 1222 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1223 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1224 if (I == DMaskIdx) 1225 MIB.addImm(MergedDMask); 1226 else 1227 MIB.add((*CI.I).getOperand(I)); 1228 } 1229 1230 // It shouldn't be possible to get this far if the two instructions 1231 // don't have a single memoperand, because MachineInstr::mayAlias() 1232 // will return true if this is the case. 1233 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1234 1235 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1236 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1237 1238 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1239 1240 unsigned SubRegIdx0, SubRegIdx1; 1241 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1242 1243 // Copy to the old destination registers. 1244 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1245 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1246 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1247 1248 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1249 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1250 .addReg(DestReg, 0, SubRegIdx0); 1251 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1252 .add(*Dest1) 1253 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1254 1255 moveInstsAfter(Copy1, InstsToMove); 1256 1257 CI.I->eraseFromParent(); 1258 Paired.I->eraseFromParent(); 1259 return New; 1260 } 1261 1262 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1263 CombineInfo &CI, CombineInfo &Paired, 1264 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1265 MachineBasicBlock *MBB = CI.I->getParent(); 1266 DebugLoc DL = CI.I->getDebugLoc(); 1267 const unsigned Opcode = getNewOpcode(CI, Paired); 1268 1269 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1270 1271 Register DestReg = MRI->createVirtualRegister(SuperRC); 1272 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1273 1274 // It shouldn't be possible to get this far if the two instructions 1275 // don't have a single memoperand, because MachineInstr::mayAlias() 1276 // will return true if this is the case. 1277 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1278 1279 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1280 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1281 1282 MachineInstr *New = 1283 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1284 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1285 .addImm(MergedOffset) // offset 1286 .addImm(CI.CPol) // cpol 1287 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1288 1289 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1290 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1291 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1292 1293 // Copy to the old destination registers. 1294 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1295 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1296 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1297 1298 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1299 .add(*Dest0) // Copy to same destination including flags and sub reg. 1300 .addReg(DestReg, 0, SubRegIdx0); 1301 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1302 .add(*Dest1) 1303 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1304 1305 moveInstsAfter(Copy1, InstsToMove); 1306 1307 CI.I->eraseFromParent(); 1308 Paired.I->eraseFromParent(); 1309 return New; 1310 } 1311 1312 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1313 CombineInfo &CI, CombineInfo &Paired, 1314 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1315 MachineBasicBlock *MBB = CI.I->getParent(); 1316 DebugLoc DL = CI.I->getDebugLoc(); 1317 1318 const unsigned Opcode = getNewOpcode(CI, Paired); 1319 1320 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1321 1322 // Copy to the new source register. 1323 Register DestReg = MRI->createVirtualRegister(SuperRC); 1324 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1325 1326 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1327 1328 AddressRegs Regs = getRegs(Opcode, *TII); 1329 1330 if (Regs.VAddr) 1331 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1332 1333 // It shouldn't be possible to get this far if the two instructions 1334 // don't have a single memoperand, because MachineInstr::mayAlias() 1335 // will return true if this is the case. 
1336 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1337 1338 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1339 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1340 1341 MachineInstr *New = 1342 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1343 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1344 .addImm(MergedOffset) // offset 1345 .addImm(CI.CPol) // cpol 1346 .addImm(0) // tfe 1347 .addImm(0) // swz 1348 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1349 1350 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1351 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1352 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1353 1354 // Copy to the old destination registers. 1355 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1356 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1357 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1358 1359 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1360 .add(*Dest0) // Copy to same destination including flags and sub reg. 1361 .addReg(DestReg, 0, SubRegIdx0); 1362 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1363 .add(*Dest1) 1364 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1365 1366 moveInstsAfter(Copy1, InstsToMove); 1367 1368 CI.I->eraseFromParent(); 1369 Paired.I->eraseFromParent(); 1370 return New; 1371 } 1372 1373 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1374 CombineInfo &CI, CombineInfo &Paired, 1375 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1376 MachineBasicBlock *MBB = CI.I->getParent(); 1377 DebugLoc DL = CI.I->getDebugLoc(); 1378 1379 const unsigned Opcode = getNewOpcode(CI, Paired); 1380 1381 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1382 1383 // Copy to the new source register. 1384 Register DestReg = MRI->createVirtualRegister(SuperRC); 1385 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1386 1387 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1388 1389 AddressRegs Regs = getRegs(Opcode, *TII); 1390 1391 if (Regs.VAddr) 1392 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1393 1394 unsigned JoinedFormat = 1395 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1396 1397 // It shouldn't be possible to get this far if the two instructions 1398 // don't have a single memoperand, because MachineInstr::mayAlias() 1399 // will return true if this is the case. 1400 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1401 1402 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1403 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1404 1405 MachineInstr *New = 1406 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1407 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1408 .addImm(MergedOffset) // offset 1409 .addImm(JoinedFormat) // format 1410 .addImm(CI.CPol) // cpol 1411 .addImm(0) // tfe 1412 .addImm(0) // swz 1413 .addMemOperand( 1414 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1415 1416 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1417 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1418 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1419 1420 // Copy to the old destination registers. 
1421 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1422 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1423 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1424 1425 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1426 .add(*Dest0) // Copy to same destination including flags and sub reg. 1427 .addReg(DestReg, 0, SubRegIdx0); 1428 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1429 .add(*Dest1) 1430 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1431 1432 moveInstsAfter(Copy1, InstsToMove); 1433 1434 CI.I->eraseFromParent(); 1435 Paired.I->eraseFromParent(); 1436 return New; 1437 } 1438 1439 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1440 CombineInfo &CI, CombineInfo &Paired, 1441 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1442 MachineBasicBlock *MBB = CI.I->getParent(); 1443 DebugLoc DL = CI.I->getDebugLoc(); 1444 1445 const unsigned Opcode = getNewOpcode(CI, Paired); 1446 1447 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1448 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1449 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1450 1451 // Copy to the new source register. 1452 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1453 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1454 1455 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1456 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1457 1458 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1459 .add(*Src0) 1460 .addImm(SubRegIdx0) 1461 .add(*Src1) 1462 .addImm(SubRegIdx1); 1463 1464 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1465 .addReg(SrcReg, RegState::Kill); 1466 1467 AddressRegs Regs = getRegs(Opcode, *TII); 1468 1469 if (Regs.VAddr) 1470 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1471 1472 unsigned JoinedFormat = 1473 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1474 1475 // It shouldn't be possible to get this far if the two instructions 1476 // don't have a single memoperand, because MachineInstr::mayAlias() 1477 // will return true if this is the case. 
1478 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1479 1480 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1481 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1482 1483 MachineInstr *New = 1484 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1485 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1486 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1487 .addImm(JoinedFormat) // format 1488 .addImm(CI.CPol) // cpol 1489 .addImm(0) // tfe 1490 .addImm(0) // swz 1491 .addMemOperand( 1492 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1493 1494 moveInstsAfter(MIB, InstsToMove); 1495 1496 CI.I->eraseFromParent(); 1497 Paired.I->eraseFromParent(); 1498 return New; 1499 } 1500 1501 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1502 const CombineInfo &Paired) { 1503 const unsigned Width = CI.Width + Paired.Width; 1504 1505 switch (CI.InstClass) { 1506 default: 1507 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1508 // FIXME: Handle d16 correctly 1509 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1510 Width); 1511 case TBUFFER_LOAD: 1512 case TBUFFER_STORE: 1513 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1514 Width); 1515 1516 case UNKNOWN: 1517 llvm_unreachable("Unknown instruction class"); 1518 case S_BUFFER_LOAD_IMM: 1519 switch (Width) { 1520 default: 1521 return 0; 1522 case 2: 1523 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1524 case 4: 1525 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1526 } 1527 case MIMG: 1528 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1529 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1530 } 1531 } 1532 1533 std::pair<unsigned, unsigned> 1534 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1535 1536 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1537 return std::make_pair(0, 0); 1538 1539 bool ReverseOrder; 1540 if (CI.InstClass == MIMG) { 1541 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1542 "No overlaps"); 1543 ReverseOrder = CI.DMask > Paired.DMask; 1544 } else 1545 ReverseOrder = CI.Offset > Paired.Offset; 1546 1547 static const unsigned Idxs[4][4] = { 1548 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1549 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1550 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1551 {AMDGPU::sub3, 0, 0, 0}, 1552 }; 1553 unsigned Idx0; 1554 unsigned Idx1; 1555 1556 assert(CI.Width >= 1 && CI.Width <= 3); 1557 assert(Paired.Width >= 1 && Paired.Width <= 3); 1558 1559 if (ReverseOrder) { 1560 Idx1 = Idxs[0][Paired.Width - 1]; 1561 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1562 } else { 1563 Idx0 = Idxs[0][CI.Width - 1]; 1564 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1565 } 1566 1567 return std::make_pair(Idx0, Idx1); 1568 } 1569 1570 const TargetRegisterClass * 1571 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1572 const CombineInfo &Paired) { 1573 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1574 switch (CI.Width + Paired.Width) { 1575 default: 1576 return nullptr; 1577 case 2: 1578 return &AMDGPU::SReg_64_XEXECRegClass; 1579 case 4: 1580 return &AMDGPU::SGPR_128RegClass; 1581 case 8: 1582 return &AMDGPU::SGPR_256RegClass; 1583 case 16: 1584 return &AMDGPU::SGPR_512RegClass; 1585 } 1586 } 1587 1588 unsigned BitWidth = 32 * (CI.Width + 
  return TRI->hasAGPRs(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
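// For illustration only (a rough sketch of the MIR this emits; register names
// are hypothetical): given Addr = {Base.LoReg, Base.HiReg, Offset}, the 64-bit
// add is expanded roughly as
//   %lo, %carry = V_ADD_CO_U32_e64 Base.LoReg, lo32(Offset), 0
//   %hi = V_ADDC_U32_e64 Base.HiReg, hi32(Offset), %carry, 0
//   %newbase = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// and %newbase is what callers then install via updateBaseAndOffset().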
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
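// Hypothetical example (operand layout is only approximate): a load such as
//   GLOBAL_LOAD_DWORDX2 %d, %oldbase, 0, 0
// becomes
//   GLOBAL_LOAD_DWORDX2 %d, %newbase, -4096, 0
// i.e. only the vaddr register and the offset immediate are rewritten in
// place; no new load or store instruction is created.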
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
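  // getGlobalSaddrOp() returns a negative value when the opcode has no global
  // SADDR form, so this check effectively limits the promotion to GLOBAL_*
  // loads and stores.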
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset is the farthest from MI's offset
  // while still within 13-bit reach.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
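    // Only instructions with the same opcode and a still-zero immediate
    // offset are considered as anchor candidates below.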
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second; dbgs() << ")";
                   P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;
    // Don't combine if volatile. We also won't be able to merge across this,
    // so break the search. We can look after this barrier for separate
    // merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort the lists by offset, and then for each CombineInfo object in
  // a list try to find an instruction that can be merged with it. If one is
  // found, it is stored in the Paired field. If no instruction is found, the
  // CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

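  // Each basic block is processed in sections delimited by instructions with
  // ordered memory references: collectMergeableInsts() gathers candidates up
  // to the next such barrier, and optimizeBlock() is then re-run on that
  // section until no further merges are found.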
  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}