//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
// (read2/write2 offsets are expressed in units of the 4-byte element size,
// so byte offsets 16 and 32 become 4 and 8).
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because loading the constant
//   into the data register is placed between the stores, although this is
//   arguably a scheduling problem.
//
// - Recomputing live intervals seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together that their differences do
//   fit in 8 bits, we can add to the base pointer and use the new, reduced
//   offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
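// This worst case bounds the per-instruction AddrIdx/AddrReg arrays in the
// CombineInfo struct below.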
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
155 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 156 return false; 157 } 158 return true; 159 } 160 161 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO); 162 }; 163 164 struct BaseRegisters { 165 Register LoReg; 166 Register HiReg; 167 168 unsigned LoSubReg = 0; 169 unsigned HiSubReg = 0; 170 }; 171 172 struct MemAddress { 173 BaseRegisters Base; 174 int64_t Offset = 0; 175 }; 176 177 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 178 179 private: 180 const GCNSubtarget *STM = nullptr; 181 const SIInstrInfo *TII = nullptr; 182 const SIRegisterInfo *TRI = nullptr; 183 MachineRegisterInfo *MRI = nullptr; 184 AliasAnalysis *AA = nullptr; 185 bool OptimizeAgain; 186 187 static bool dmasksCanBeCombined(const CombineInfo &CI, 188 const SIInstrInfo &TII, 189 const CombineInfo &Paired); 190 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 191 CombineInfo &Paired, bool Modify = false); 192 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 193 const CombineInfo &Paired); 194 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 195 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 196 const CombineInfo &Paired); 197 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 198 const CombineInfo &Paired); 199 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 200 201 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 202 SmallVectorImpl<MachineInstr *> &InstsToMove); 203 204 unsigned read2Opcode(unsigned EltSize) const; 205 unsigned read2ST64Opcode(unsigned EltSize) const; 206 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 207 CombineInfo &Paired, 208 const SmallVectorImpl<MachineInstr *> &InstsToMove); 209 210 unsigned write2Opcode(unsigned EltSize) const; 211 unsigned write2ST64Opcode(unsigned EltSize) const; 212 MachineBasicBlock::iterator 213 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 214 const SmallVectorImpl<MachineInstr *> &InstsToMove); 215 MachineBasicBlock::iterator 216 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 217 const SmallVectorImpl<MachineInstr *> &InstsToMove); 218 MachineBasicBlock::iterator 219 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 220 const SmallVectorImpl<MachineInstr *> &InstsToMove); 221 MachineBasicBlock::iterator 222 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 223 const SmallVectorImpl<MachineInstr *> &InstsToMove); 224 MachineBasicBlock::iterator 225 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 226 const SmallVectorImpl<MachineInstr *> &InstsToMove); 227 MachineBasicBlock::iterator 228 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 229 const SmallVectorImpl<MachineInstr *> &InstsToMove); 230 MachineBasicBlock::iterator 231 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 232 const SmallVectorImpl<MachineInstr *> &InstsToMove); 233 234 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 235 int32_t NewOffset) const; 236 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 237 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 238 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 239 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 240 /// Promotes constant offset to the immediate by adjusting the base. 
It 241 /// tries to use a base from the nearby instructions that allows it to have 242 /// a 13bit constant offset which gets promoted to the immediate. 243 bool promoteConstantOffsetToImm(MachineInstr &CI, 244 MemInfoMap &Visited, 245 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 246 void addInstToMergeableList(const CombineInfo &CI, 247 std::list<std::list<CombineInfo> > &MergeableInsts) const; 248 249 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 250 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 251 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 252 std::list<std::list<CombineInfo>> &MergeableInsts) const; 253 254 public: 255 static char ID; 256 257 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 258 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 259 } 260 261 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 262 bool &OptimizeListAgain); 263 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 264 265 bool runOnMachineFunction(MachineFunction &MF) override; 266 267 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 268 269 void getAnalysisUsage(AnalysisUsage &AU) const override { 270 AU.setPreservesCFG(); 271 AU.addRequired<AAResultsWrapperPass>(); 272 273 MachineFunctionPass::getAnalysisUsage(AU); 274 } 275 276 MachineFunctionProperties getRequiredProperties() const override { 277 return MachineFunctionProperties() 278 .set(MachineFunctionProperties::Property::IsSSA); 279 } 280 }; 281 282 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 283 const unsigned Opc = MI.getOpcode(); 284 285 if (TII.isMUBUF(Opc)) { 286 // FIXME: Handle d16 correctly 287 return AMDGPU::getMUBUFElements(Opc); 288 } 289 if (TII.isMIMG(MI)) { 290 uint64_t DMaskImm = 291 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 292 return countPopulation(DMaskImm); 293 } 294 if (TII.isMTBUF(Opc)) { 295 return AMDGPU::getMTBUFElements(Opc); 296 } 297 298 switch (Opc) { 299 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 300 return 1; 301 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 302 return 2; 303 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 304 return 4; 305 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 306 return 8; 307 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 308 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 309 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 310 case AMDGPU::DS_WRITE_B32_gfx9: 311 return 1; 312 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 313 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 314 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 315 case AMDGPU::DS_WRITE_B64_gfx9: 316 return 2; 317 default: 318 return 0; 319 } 320 } 321 322 /// Maps instruction opcode to enum InstClassEnum. 323 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 324 switch (Opc) { 325 default: 326 if (TII.isMUBUF(Opc)) { 327 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 328 default: 329 return UNKNOWN; 330 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 331 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 332 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 333 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 334 return BUFFER_LOAD; 335 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 336 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 337 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 338 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 339 return BUFFER_STORE; 340 } 341 } 342 if (TII.isMIMG(Opc)) { 343 // Ignore instructions encoded without vaddr. 
344 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 345 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 346 return UNKNOWN; 347 // Ignore BVH instructions 348 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH) 349 return UNKNOWN; 350 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 351 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 352 TII.isGather4(Opc)) 353 return UNKNOWN; 354 return MIMG; 355 } 356 if (TII.isMTBUF(Opc)) { 357 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 358 default: 359 return UNKNOWN; 360 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 363 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 364 return TBUFFER_LOAD; 365 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 368 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 369 return TBUFFER_STORE; 370 } 371 } 372 return UNKNOWN; 373 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 374 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 375 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 376 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 377 return S_BUFFER_LOAD_IMM; 378 case AMDGPU::DS_READ_B32: 379 case AMDGPU::DS_READ_B32_gfx9: 380 case AMDGPU::DS_READ_B64: 381 case AMDGPU::DS_READ_B64_gfx9: 382 return DS_READ; 383 case AMDGPU::DS_WRITE_B32: 384 case AMDGPU::DS_WRITE_B32_gfx9: 385 case AMDGPU::DS_WRITE_B64: 386 case AMDGPU::DS_WRITE_B64_gfx9: 387 return DS_WRITE; 388 } 389 } 390 391 /// Determines instruction subclass from opcode. Only instructions 392 /// of the same subclass can be merged together. 393 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 394 switch (Opc) { 395 default: 396 if (TII.isMUBUF(Opc)) 397 return AMDGPU::getMUBUFBaseOpcode(Opc); 398 if (TII.isMIMG(Opc)) { 399 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 400 assert(Info); 401 return Info->BaseOpcode; 402 } 403 if (TII.isMTBUF(Opc)) 404 return AMDGPU::getMTBUFBaseOpcode(Opc); 405 return -1; 406 case AMDGPU::DS_READ_B32: 407 case AMDGPU::DS_READ_B32_gfx9: 408 case AMDGPU::DS_READ_B64: 409 case AMDGPU::DS_READ_B64_gfx9: 410 case AMDGPU::DS_WRITE_B32: 411 case AMDGPU::DS_WRITE_B32_gfx9: 412 case AMDGPU::DS_WRITE_B64: 413 case AMDGPU::DS_WRITE_B64_gfx9: 414 return Opc; 415 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 416 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 417 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 418 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 419 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 420 } 421 } 422 423 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 424 AddressRegs Result; 425 426 if (TII.isMUBUF(Opc)) { 427 if (AMDGPU::getMUBUFHasVAddr(Opc)) 428 Result.VAddr = true; 429 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 430 Result.SRsrc = true; 431 if (AMDGPU::getMUBUFHasSoffset(Opc)) 432 Result.SOffset = true; 433 434 return Result; 435 } 436 437 if (TII.isMIMG(Opc)) { 438 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 439 if (VAddr0Idx >= 0) { 440 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 441 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 442 } else { 443 Result.VAddr = true; 444 } 445 Result.SRsrc = true; 446 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 447 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 448 Result.SSamp = true; 449 450 return Result; 451 } 452 if (TII.isMTBUF(Opc)) { 453 if (AMDGPU::getMTBUFHasVAddr(Opc)) 454 Result.VAddr = 
true; 455 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 456 Result.SRsrc = true; 457 if (AMDGPU::getMTBUFHasSoffset(Opc)) 458 Result.SOffset = true; 459 460 return Result; 461 } 462 463 switch (Opc) { 464 default: 465 return Result; 466 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 467 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 468 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 469 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: 470 Result.SBase = true; 471 return Result; 472 case AMDGPU::DS_READ_B32: 473 case AMDGPU::DS_READ_B64: 474 case AMDGPU::DS_READ_B32_gfx9: 475 case AMDGPU::DS_READ_B64_gfx9: 476 case AMDGPU::DS_WRITE_B32: 477 case AMDGPU::DS_WRITE_B64: 478 case AMDGPU::DS_WRITE_B32_gfx9: 479 case AMDGPU::DS_WRITE_B64_gfx9: 480 Result.Addr = true; 481 return Result; 482 } 483 } 484 485 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 486 const SILoadStoreOptimizer &LSO) { 487 I = MI; 488 unsigned Opc = MI->getOpcode(); 489 InstClass = getInstClass(Opc, *LSO.TII); 490 491 if (InstClass == UNKNOWN) 492 return; 493 494 switch (InstClass) { 495 case DS_READ: 496 EltSize = 497 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 498 : 4; 499 break; 500 case DS_WRITE: 501 EltSize = 502 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 503 : 4; 504 break; 505 case S_BUFFER_LOAD_IMM: 506 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4); 507 break; 508 default: 509 EltSize = 4; 510 break; 511 } 512 513 if (InstClass == MIMG) { 514 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 515 // Offset is not considered for MIMG instructions. 516 Offset = 0; 517 } else { 518 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 519 Offset = I->getOperand(OffsetIdx).getImm(); 520 } 521 522 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 523 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 524 525 Width = getOpcodeWidth(*I, *LSO.TII); 526 527 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 528 Offset &= 0xffff; 529 } else if (InstClass != MIMG) { 530 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm(); 531 } 532 533 AddressRegs Regs = getRegs(Opc, *LSO.TII); 534 535 NumAddresses = 0; 536 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 537 AddrIdx[NumAddresses++] = 538 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 539 if (Regs.Addr) 540 AddrIdx[NumAddresses++] = 541 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 542 if (Regs.SBase) 543 AddrIdx[NumAddresses++] = 544 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 545 if (Regs.SRsrc) 546 AddrIdx[NumAddresses++] = 547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 548 if (Regs.SOffset) 549 AddrIdx[NumAddresses++] = 550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 551 if (Regs.VAddr) 552 AddrIdx[NumAddresses++] = 553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 554 if (Regs.SSamp) 555 AddrIdx[NumAddresses++] = 556 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 557 assert(NumAddresses <= MaxAddressRegs); 558 559 for (unsigned J = 0; J < NumAddresses; J++) 560 AddrReg[J] = &I->getOperand(AddrIdx[J]); 561 } 562 563 } // end anonymous namespace. 
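// Register the pass with the legacy pass manager. The AAResultsWrapperPass
// dependency supplies the alias analysis queried when deciding whether memory
// accesses may be reordered around each other.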
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Op.getReg().isPhysical())
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && Use.getReg().isPhysical() &&
                         PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
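// The combined operand starts at the smaller of the two offsets and covers
// the sum of the two sizes.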
654 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 655 const MachineMemOperand *A, 656 const MachineMemOperand *B) { 657 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 658 unsigned Size = A->getSize() + B->getSize(); 659 // This function adds the offset parameter to the existing offset for A, 660 // so we pass 0 here as the offset and then manually set it to the correct 661 // value after the call. 662 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 663 MMO->setOffset(MinOffset); 664 return MMO; 665 } 666 667 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 668 const SIInstrInfo &TII, 669 const CombineInfo &Paired) { 670 assert(CI.InstClass == MIMG); 671 672 // Ignore instructions with tfe/lwe set. 673 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 674 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 675 676 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 677 return false; 678 679 // Check other optional immediate operands for equality. 680 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16, 681 AMDGPU::OpName::unorm, AMDGPU::OpName::da, 682 AMDGPU::OpName::r128, AMDGPU::OpName::a16}; 683 684 for (auto op : OperandsToMatch) { 685 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 686 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 687 return false; 688 if (Idx != -1 && 689 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 690 return false; 691 } 692 693 // Check DMask for overlaps. 694 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 695 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 696 697 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 698 if ((1u << AllowedBitsForMin) <= MinMask) 699 return false; 700 701 return true; 702 } 703 704 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 705 unsigned ComponentCount, 706 const GCNSubtarget &STI) { 707 if (ComponentCount > 4) 708 return 0; 709 710 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 711 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 712 if (!OldFormatInfo) 713 return 0; 714 715 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 716 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 717 ComponentCount, 718 OldFormatInfo->NumFormat, STI); 719 720 if (!NewFormatInfo) 721 return 0; 722 723 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 724 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 725 726 return NewFormatInfo->Format; 727 } 728 729 // Return the value in the inclusive range [Lo,Hi] that is aligned to the 730 // highest power of two. Note that the result is well defined for all inputs 731 // including corner cases like: 732 // - if Lo == Hi, return that value 733 // - if Lo == 0, return 0 (even though the "- 1" below underflows 734 // - if Lo > Hi, return 0 (as if the range wrapped around) 735 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { 736 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1); 737 } 738 739 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 740 const GCNSubtarget &STI, 741 CombineInfo &Paired, 742 bool Modify) { 743 assert(CI.InstClass != MIMG); 744 745 // XXX - Would the same offset be OK? Is there any reason this would happen or 746 // be useful? 
747 if (CI.Offset == Paired.Offset) 748 return false; 749 750 // This won't be valid if the offset isn't aligned. 751 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 752 return false; 753 754 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 755 756 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 757 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 758 if (!Info0) 759 return false; 760 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 761 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 762 if (!Info1) 763 return false; 764 765 if (Info0->BitsPerComp != Info1->BitsPerComp || 766 Info0->NumFormat != Info1->NumFormat) 767 return false; 768 769 // TODO: Should be possible to support more formats, but if format loads 770 // are not dword-aligned, the merged load might not be valid. 771 if (Info0->BitsPerComp != 32) 772 return false; 773 774 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 775 return false; 776 } 777 778 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 779 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 780 CI.UseST64 = false; 781 CI.BaseOff = 0; 782 783 // Handle all non-DS instructions. 784 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 785 return (EltOffset0 + CI.Width == EltOffset1 || 786 EltOffset1 + Paired.Width == EltOffset0) && 787 CI.CPol == Paired.CPol && 788 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol); 789 } 790 791 // If the offset in elements doesn't fit in 8-bits, we might be able to use 792 // the stride 64 versions. 793 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 794 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 795 if (Modify) { 796 CI.Offset = EltOffset0 / 64; 797 Paired.Offset = EltOffset1 / 64; 798 CI.UseST64 = true; 799 } 800 return true; 801 } 802 803 // Check if the new offsets fit in the reduced 8-bit range. 804 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 805 if (Modify) { 806 CI.Offset = EltOffset0; 807 Paired.Offset = EltOffset1; 808 } 809 return true; 810 } 811 812 // Try to shift base address to decrease offsets. 813 uint32_t Min = std::min(EltOffset0, EltOffset1); 814 uint32_t Max = std::max(EltOffset0, EltOffset1); 815 816 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 817 if (((Max - Min) & ~Mask) == 0) { 818 if (Modify) { 819 // From the range of values we could use for BaseOff, choose the one that 820 // is aligned to the highest power of two, to maximise the chance that 821 // the same offset can be reused for other load/store pairs. 822 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 823 // Copy the low bits of the offsets, so that when we adjust them by 824 // subtracting BaseOff they will be multiples of 64. 825 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 826 CI.BaseOff = BaseOff * CI.EltSize; 827 CI.Offset = (EltOffset0 - BaseOff) / 64; 828 Paired.Offset = (EltOffset1 - BaseOff) / 64; 829 CI.UseST64 = true; 830 } 831 return true; 832 } 833 834 if (isUInt<8>(Max - Min)) { 835 if (Modify) { 836 // From the range of values we could use for BaseOff, choose the one that 837 // is aligned to the highest power of two, to maximise the chance that 838 // the same offset can be reused for other load/store pairs. 
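      // Illustrative example (values assumed for exposition, not taken from a
      // test): with element offsets EltOffset0 = 0x101 and EltOffset1 = 0x180,
      // Max - Min = 0x7f fits in 8 bits, and
      // mostAlignedValueInRange(0x180 - 0xff, 0x101) picks BaseOff = 0x100
      // (256-aligned), leaving the reduced offsets 0x1 and 0x80.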
839 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 840 CI.BaseOff = BaseOff * CI.EltSize; 841 CI.Offset = EltOffset0 - BaseOff; 842 Paired.Offset = EltOffset1 - BaseOff; 843 } 844 return true; 845 } 846 847 return false; 848 } 849 850 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 851 const CombineInfo &CI, 852 const CombineInfo &Paired) { 853 const unsigned Width = (CI.Width + Paired.Width); 854 switch (CI.InstClass) { 855 default: 856 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 857 case S_BUFFER_LOAD_IMM: 858 switch (Width) { 859 default: 860 return false; 861 case 2: 862 case 4: 863 case 8: 864 return true; 865 } 866 } 867 } 868 869 const TargetRegisterClass * 870 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 871 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 872 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 873 } 874 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 875 return TRI->getRegClassForReg(*MRI, Src->getReg()); 876 } 877 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 878 return TRI->getRegClassForReg(*MRI, Src->getReg()); 879 } 880 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 881 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 882 } 883 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 884 return TRI->getRegClassForReg(*MRI, Src->getReg()); 885 } 886 return nullptr; 887 } 888 889 /// This function assumes that CI comes before Paired in a basic block. 890 bool SILoadStoreOptimizer::checkAndPrepareMerge( 891 CombineInfo &CI, CombineInfo &Paired, 892 SmallVectorImpl<MachineInstr *> &InstsToMove) { 893 894 // Check both offsets (or masks for MIMG) can be combined and fit in the 895 // reduced range. 896 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) 897 return false; 898 899 if (CI.InstClass != MIMG && 900 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) 901 return false; 902 903 const unsigned Opc = CI.I->getOpcode(); 904 const InstClassEnum InstClass = getInstClass(Opc, *TII); 905 906 if (InstClass == UNKNOWN) { 907 return false; 908 } 909 const unsigned InstSubclass = getInstSubclass(Opc, *TII); 910 911 DenseSet<Register> RegDefsToMove; 912 DenseSet<Register> PhysRegUsesToMove; 913 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); 914 915 const TargetRegisterClass *DataRC = getDataRegClass(*CI.I); 916 bool IsAGPR = TRI->hasAGPRs(DataRC); 917 918 MachineBasicBlock::iterator E = std::next(Paired.I); 919 MachineBasicBlock::iterator MBBI = std::next(CI.I); 920 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); 921 for (; MBBI != E; ++MBBI) { 922 923 if (MBBI == MBBE) { 924 // CombineInfo::Order is a hint on the instruction ordering within the 925 // basic block. This hint suggests that CI precedes Paired, which is 926 // true most of the time. However, moveInstsAfter() processing a 927 // previous list may have changed this order in a situation when it 928 // moves an instruction which exists in some other merge list. 929 // In this case it must be dependent. 930 return false; 931 } 932 933 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || 934 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { 935 // This is not a matching instruction, but we can keep looking as 936 // long as one of these conditions are met: 937 // 1. It is safe to move I down past MBBI. 938 // 2. 
It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will
      // need to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              InstsToMove))
      continue;

    if (&*MBBI == &*Paired.I) {
      if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
        return false;
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However, we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction, the verifier
      //        will complain.
      if (IsAGPR && CI.InstClass == DS_WRITE)
        return false;

      // We need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {

        // Call offsetsCanBeCombined with modify = true so that the offsets are
        // correct for the new instruction. This should return true, because
        // this function should only be called on CombineInfo objects that
        // have already been confirmed to be mergeable.
        if (CI.InstClass != MIMG)
          offsetsCanBeCombined(CI, *STM, Paired, true);
        return true;
      }
      return false;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ?
AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1025 1026 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1027 : AMDGPU::DS_READ2ST64_B64_gfx9; 1028 } 1029 1030 MachineBasicBlock::iterator 1031 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1032 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1033 MachineBasicBlock *MBB = CI.I->getParent(); 1034 1035 // Be careful, since the addresses could be subregisters themselves in weird 1036 // cases, like vectors of pointers. 1037 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1038 1039 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1040 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1041 1042 unsigned NewOffset0 = CI.Offset; 1043 unsigned NewOffset1 = Paired.Offset; 1044 unsigned Opc = 1045 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1046 1047 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1048 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1049 1050 if (NewOffset0 > NewOffset1) { 1051 // Canonicalize the merged instruction so the smaller offset comes first. 1052 std::swap(NewOffset0, NewOffset1); 1053 std::swap(SubRegIdx0, SubRegIdx1); 1054 } 1055 1056 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1057 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1058 1059 const MCInstrDesc &Read2Desc = TII->get(Opc); 1060 1061 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1062 Register DestReg = MRI->createVirtualRegister(SuperRC); 1063 1064 DebugLoc DL = CI.I->getDebugLoc(); 1065 1066 Register BaseReg = AddrReg->getReg(); 1067 unsigned BaseSubReg = AddrReg->getSubReg(); 1068 unsigned BaseRegFlags = 0; 1069 if (CI.BaseOff) { 1070 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1071 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1072 .addImm(CI.BaseOff); 1073 1074 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1075 BaseRegFlags = RegState::Kill; 1076 1077 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1078 .addReg(ImmReg) 1079 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1080 .addImm(0); // clamp bit 1081 BaseSubReg = 0; 1082 } 1083 1084 MachineInstrBuilder Read2 = 1085 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1086 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1087 .addImm(NewOffset0) // offset0 1088 .addImm(NewOffset1) // offset1 1089 .addImm(0) // gds 1090 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1091 1092 (void)Read2; 1093 1094 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1095 1096 // Copy to the old destination registers. 1097 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1098 .add(*Dest0) // Copy to same destination including flags and sub reg. 1099 .addReg(DestReg, 0, SubRegIdx0); 1100 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1101 .add(*Dest1) 1102 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1103 1104 moveInstsAfter(Copy1, InstsToMove); 1105 1106 CI.I->eraseFromParent(); 1107 Paired.I->eraseFromParent(); 1108 1109 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1110 return Read2; 1111 } 1112 1113 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1114 if (STM->ldsRequiresM0Init()) 1115 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1116 return (EltSize == 4) ? 
AMDGPU::DS_WRITE2_B32_gfx9 1117 : AMDGPU::DS_WRITE2_B64_gfx9; 1118 } 1119 1120 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1121 if (STM->ldsRequiresM0Init()) 1122 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1123 : AMDGPU::DS_WRITE2ST64_B64; 1124 1125 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1126 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1127 } 1128 1129 MachineBasicBlock::iterator 1130 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1131 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1132 MachineBasicBlock *MBB = CI.I->getParent(); 1133 1134 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1135 // sure we preserve the subregister index and any register flags set on them. 1136 const MachineOperand *AddrReg = 1137 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1138 const MachineOperand *Data0 = 1139 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1140 const MachineOperand *Data1 = 1141 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1142 1143 unsigned NewOffset0 = CI.Offset; 1144 unsigned NewOffset1 = Paired.Offset; 1145 unsigned Opc = 1146 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1147 1148 if (NewOffset0 > NewOffset1) { 1149 // Canonicalize the merged instruction so the smaller offset comes first. 1150 std::swap(NewOffset0, NewOffset1); 1151 std::swap(Data0, Data1); 1152 } 1153 1154 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1155 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1156 1157 const MCInstrDesc &Write2Desc = TII->get(Opc); 1158 DebugLoc DL = CI.I->getDebugLoc(); 1159 1160 Register BaseReg = AddrReg->getReg(); 1161 unsigned BaseSubReg = AddrReg->getSubReg(); 1162 unsigned BaseRegFlags = 0; 1163 if (CI.BaseOff) { 1164 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1165 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1166 .addImm(CI.BaseOff); 1167 1168 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1169 BaseRegFlags = RegState::Kill; 1170 1171 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1172 .addReg(ImmReg) 1173 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1174 .addImm(0); // clamp bit 1175 BaseSubReg = 0; 1176 } 1177 1178 MachineInstrBuilder Write2 = 1179 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1180 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1181 .add(*Data0) // data0 1182 .add(*Data1) // data1 1183 .addImm(NewOffset0) // offset0 1184 .addImm(NewOffset1) // offset1 1185 .addImm(0) // gds 1186 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1187 1188 moveInstsAfter(Write2, InstsToMove); 1189 1190 CI.I->eraseFromParent(); 1191 Paired.I->eraseFromParent(); 1192 1193 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1194 return Write2; 1195 } 1196 1197 MachineBasicBlock::iterator 1198 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1199 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1200 MachineBasicBlock *MBB = CI.I->getParent(); 1201 DebugLoc DL = CI.I->getDebugLoc(); 1202 const unsigned Opcode = getNewOpcode(CI, Paired); 1203 1204 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1205 1206 Register DestReg = MRI->createVirtualRegister(SuperRC); 1207 unsigned MergedDMask = CI.DMask | Paired.DMask; 1208 unsigned DMaskIdx = 1209 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1210 1211 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 
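  // Clone the operands of the first image instruction, skipping its def
  // (operand 0, already supplied as DestReg above) and substituting the
  // merged dmask at DMaskIdx.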
1212 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1213 if (I == DMaskIdx) 1214 MIB.addImm(MergedDMask); 1215 else 1216 MIB.add((*CI.I).getOperand(I)); 1217 } 1218 1219 // It shouldn't be possible to get this far if the two instructions 1220 // don't have a single memoperand, because MachineInstr::mayAlias() 1221 // will return true if this is the case. 1222 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1223 1224 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1225 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1226 1227 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1228 1229 unsigned SubRegIdx0, SubRegIdx1; 1230 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1231 1232 // Copy to the old destination registers. 1233 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1234 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1235 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1236 1237 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1238 .add(*Dest0) // Copy to same destination including flags and sub reg. 1239 .addReg(DestReg, 0, SubRegIdx0); 1240 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1241 .add(*Dest1) 1242 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1243 1244 moveInstsAfter(Copy1, InstsToMove); 1245 1246 CI.I->eraseFromParent(); 1247 Paired.I->eraseFromParent(); 1248 return New; 1249 } 1250 1251 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1252 CombineInfo &CI, CombineInfo &Paired, 1253 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1254 MachineBasicBlock *MBB = CI.I->getParent(); 1255 DebugLoc DL = CI.I->getDebugLoc(); 1256 const unsigned Opcode = getNewOpcode(CI, Paired); 1257 1258 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1259 1260 Register DestReg = MRI->createVirtualRegister(SuperRC); 1261 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1262 1263 // It shouldn't be possible to get this far if the two instructions 1264 // don't have a single memoperand, because MachineInstr::mayAlias() 1265 // will return true if this is the case. 1266 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1267 1268 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1269 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1270 1271 MachineInstr *New = 1272 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1273 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1274 .addImm(MergedOffset) // offset 1275 .addImm(CI.CPol) // cpol 1276 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1277 1278 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1279 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1280 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1281 1282 // Copy to the old destination registers. 1283 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1284 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1285 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1286 1287 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1288 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1289 .addReg(DestReg, 0, SubRegIdx0); 1290 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1291 .add(*Dest1) 1292 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1293 1294 moveInstsAfter(Copy1, InstsToMove); 1295 1296 CI.I->eraseFromParent(); 1297 Paired.I->eraseFromParent(); 1298 return New; 1299 } 1300 1301 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1302 CombineInfo &CI, CombineInfo &Paired, 1303 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1304 MachineBasicBlock *MBB = CI.I->getParent(); 1305 DebugLoc DL = CI.I->getDebugLoc(); 1306 1307 const unsigned Opcode = getNewOpcode(CI, Paired); 1308 1309 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1310 1311 // Copy to the new source register. 1312 Register DestReg = MRI->createVirtualRegister(SuperRC); 1313 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1314 1315 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1316 1317 AddressRegs Regs = getRegs(Opcode, *TII); 1318 1319 if (Regs.VAddr) 1320 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1321 1322 // It shouldn't be possible to get this far if the two instructions 1323 // don't have a single memoperand, because MachineInstr::mayAlias() 1324 // will return true if this is the case. 1325 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1326 1327 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1328 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1329 1330 MachineInstr *New = 1331 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1332 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1333 .addImm(MergedOffset) // offset 1334 .addImm(CI.CPol) // cpol 1335 .addImm(0) // tfe 1336 .addImm(0) // swz 1337 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1338 1339 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1340 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1341 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1342 1343 // Copy to the old destination registers. 1344 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1345 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1346 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1347 1348 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1349 .add(*Dest0) // Copy to same destination including flags and sub reg. 1350 .addReg(DestReg, 0, SubRegIdx0); 1351 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1352 .add(*Dest1) 1353 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1354 1355 moveInstsAfter(Copy1, InstsToMove); 1356 1357 CI.I->eraseFromParent(); 1358 Paired.I->eraseFromParent(); 1359 return New; 1360 } 1361 1362 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1363 CombineInfo &CI, CombineInfo &Paired, 1364 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1365 MachineBasicBlock *MBB = CI.I->getParent(); 1366 DebugLoc DL = CI.I->getDebugLoc(); 1367 1368 const unsigned Opcode = getNewOpcode(CI, Paired); 1369 1370 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1371 1372 // Copy to the new source register. 
1373 Register DestReg = MRI->createVirtualRegister(SuperRC); 1374 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1375 1376 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1377 1378 AddressRegs Regs = getRegs(Opcode, *TII); 1379 1380 if (Regs.VAddr) 1381 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1382 1383 unsigned JoinedFormat = 1384 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1385 1386 // It shouldn't be possible to get this far if the two instructions 1387 // don't have a single memoperand, because MachineInstr::mayAlias() 1388 // will return true if this is the case. 1389 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1390 1391 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1392 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1393 1394 MachineInstr *New = 1395 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1396 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1397 .addImm(MergedOffset) // offset 1398 .addImm(JoinedFormat) // format 1399 .addImm(CI.CPol) // cpol 1400 .addImm(0) // tfe 1401 .addImm(0) // swz 1402 .addMemOperand( 1403 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1404 1405 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1406 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1407 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1408 1409 // Copy to the old destination registers. 1410 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1411 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1412 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1413 1414 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1415 .add(*Dest0) // Copy to same destination including flags and sub reg. 1416 .addReg(DestReg, 0, SubRegIdx0); 1417 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1418 .add(*Dest1) 1419 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1420 1421 moveInstsAfter(Copy1, InstsToMove); 1422 1423 CI.I->eraseFromParent(); 1424 Paired.I->eraseFromParent(); 1425 return New; 1426 } 1427 1428 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1429 CombineInfo &CI, CombineInfo &Paired, 1430 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1431 MachineBasicBlock *MBB = CI.I->getParent(); 1432 DebugLoc DL = CI.I->getDebugLoc(); 1433 1434 const unsigned Opcode = getNewOpcode(CI, Paired); 1435 1436 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1437 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1438 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1439 1440 // Copy to the new source register. 
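  // The two original vdata operands are glued into one wide source register
  // with a REG_SEQUENCE, placed at the sub-register indices computed above.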
1441 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1442 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1443 1444 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1445 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1446 1447 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1448 .add(*Src0) 1449 .addImm(SubRegIdx0) 1450 .add(*Src1) 1451 .addImm(SubRegIdx1); 1452 1453 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1454 .addReg(SrcReg, RegState::Kill); 1455 1456 AddressRegs Regs = getRegs(Opcode, *TII); 1457 1458 if (Regs.VAddr) 1459 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1460 1461 unsigned JoinedFormat = 1462 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1463 1464 // It shouldn't be possible to get this far if the two instructions 1465 // don't have a single memoperand, because MachineInstr::mayAlias() 1466 // will return true if this is the case. 1467 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1468 1469 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1470 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1471 1472 MachineInstr *New = 1473 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1474 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1475 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1476 .addImm(JoinedFormat) // format 1477 .addImm(CI.CPol) // cpol 1478 .addImm(0) // tfe 1479 .addImm(0) // swz 1480 .addMemOperand( 1481 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1482 1483 moveInstsAfter(MIB, InstsToMove); 1484 1485 CI.I->eraseFromParent(); 1486 Paired.I->eraseFromParent(); 1487 return New; 1488 } 1489 1490 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1491 const CombineInfo &Paired) { 1492 const unsigned Width = CI.Width + Paired.Width; 1493 1494 switch (CI.InstClass) { 1495 default: 1496 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1497 // FIXME: Handle d16 correctly 1498 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1499 Width); 1500 case TBUFFER_LOAD: 1501 case TBUFFER_STORE: 1502 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1503 Width); 1504 1505 case UNKNOWN: 1506 llvm_unreachable("Unknown instruction class"); 1507 case S_BUFFER_LOAD_IMM: 1508 switch (Width) { 1509 default: 1510 return 0; 1511 case 2: 1512 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1513 case 4: 1514 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1515 case 8: 1516 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM; 1517 } 1518 case MIMG: 1519 assert((countPopulation(CI.DMask | Paired.DMask) == Width) && 1520 "No overlaps"); 1521 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1522 } 1523 } 1524 1525 std::pair<unsigned, unsigned> 1526 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, 1527 const CombineInfo &Paired) { 1528 bool ReverseOrder; 1529 if (CI.InstClass == MIMG) { 1530 assert( 1531 (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1532 "No overlaps"); 1533 ReverseOrder = CI.DMask > Paired.DMask; 1534 } else { 1535 ReverseOrder = CI.Offset > Paired.Offset; 1536 } 1537 1538 unsigned Idx0; 1539 unsigned Idx1; 1540 1541 static const unsigned Idxs[5][4] = { 1542 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1543 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 
AMDGPU::sub1_sub2_sub3_sub4}, 1544 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5}, 1545 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6}, 1546 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7}, 1547 }; 1548 1549 assert(CI.Width >= 1 && CI.Width <= 4); 1550 assert(Paired.Width >= 1 && Paired.Width <= 4); 1551 1552 if (ReverseOrder) { 1553 Idx1 = Idxs[0][Paired.Width - 1]; 1554 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1555 } else { 1556 Idx0 = Idxs[0][CI.Width - 1]; 1557 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1558 } 1559 1560 return std::make_pair(Idx0, Idx1); 1561 } 1562 1563 const TargetRegisterClass * 1564 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1565 const CombineInfo &Paired) { 1566 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1567 switch (CI.Width + Paired.Width) { 1568 default: 1569 return nullptr; 1570 case 2: 1571 return &AMDGPU::SReg_64_XEXECRegClass; 1572 case 4: 1573 return &AMDGPU::SGPR_128RegClass; 1574 case 8: 1575 return &AMDGPU::SGPR_256RegClass; 1576 case 16: 1577 return &AMDGPU::SGPR_512RegClass; 1578 } 1579 } 1580 1581 unsigned BitWidth = 32 * (CI.Width + Paired.Width); 1582 return TRI->isAGPRClass(getDataRegClass(*CI.I)) 1583 ? TRI->getAGPRClassForBitWidth(BitWidth) 1584 : TRI->getVGPRClassForBitWidth(BitWidth); 1585 } 1586 1587 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1588 CombineInfo &CI, CombineInfo &Paired, 1589 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1590 MachineBasicBlock *MBB = CI.I->getParent(); 1591 DebugLoc DL = CI.I->getDebugLoc(); 1592 1593 const unsigned Opcode = getNewOpcode(CI, Paired); 1594 1595 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1596 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1597 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1598 1599 // Copy to the new source register. 1600 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1601 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1602 1603 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1604 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1605 1606 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1607 .add(*Src0) 1608 .addImm(SubRegIdx0) 1609 .add(*Src1) 1610 .addImm(SubRegIdx1); 1611 1612 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1613 .addReg(SrcReg, RegState::Kill); 1614 1615 AddressRegs Regs = getRegs(Opcode, *TII); 1616 1617 if (Regs.VAddr) 1618 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1619 1620 1621 // It shouldn't be possible to get this far if the two instructions 1622 // don't have a single memoperand, because MachineInstr::mayAlias() 1623 // will return true if this is the case. 

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
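
// Schematically (virtual register names invented), when the low offset does
// not fit an inline constant this emits:
//   %off:sreg_32 = S_MOV_B32 <lo 32 bits of Addr.Offset>
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, %off, 0
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 Addr.Base.HiReg, <hi 32 bits>, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// which mirrors the pattern matched by processBaseWithConstOffset below.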

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
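
// For the expected pattern in the comment above, and using the register names
// shown there, the extracted result would be:
//   Addr.Base.LoReg = %BASE_LO, Addr.Base.HiReg = %BASE_HI
//   Addr.Offset = 8000 (low 32 bits from %OFFSET0, high 32 bits from the
//   literal 0 operand of V_ADDC_U32_e64)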

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // distance from MI's offset that still fits in a 13-bit immediate.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just recompute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
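
// Illustrative example (offsets invented): given four mergeable
// buffer_load_dword instructions, two addressing vaddr %a (offsets 0 and 4)
// and two addressing vaddr %b (offsets 8 and 12), this produces two lists:
//   { (%a, 0), (%a, 4) } and { (%b, 8), (%b, 12) }
// Each list is later sorted by offset and scanned for adjacent pairs to merge.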

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this,
    // so break the search. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *this);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
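
// Illustrative example of the OptimizeListAgain/OptimizeAgain interplay
// (offsets invented): four s_buffer_load_dword instructions at offsets 0, 4,
// 8 and 12 are first merged pairwise into two s_buffer_load_dwordx2. Since
// the merged width (2) is still below the S_BUFFER_LOAD_IMM limit of 8, the
// list is reprocessed and the two dwordx2 loads combine into a single
// s_buffer_load_dwordx4.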

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}