//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputation seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    bool GLC = 0;
    bool SLC = 0;
    bool DLC = 0;
    bool SCCB = 0; // vmem only.
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
158 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 159 return false; 160 } 161 return true; 162 } 163 164 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 165 const GCNSubtarget &STM); 166 }; 167 168 struct BaseRegisters { 169 Register LoReg; 170 Register HiReg; 171 172 unsigned LoSubReg = 0; 173 unsigned HiSubReg = 0; 174 }; 175 176 struct MemAddress { 177 BaseRegisters Base; 178 int64_t Offset = 0; 179 }; 180 181 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 182 183 private: 184 const GCNSubtarget *STM = nullptr; 185 const SIInstrInfo *TII = nullptr; 186 const SIRegisterInfo *TRI = nullptr; 187 MachineRegisterInfo *MRI = nullptr; 188 AliasAnalysis *AA = nullptr; 189 bool OptimizeAgain; 190 191 static bool dmasksCanBeCombined(const CombineInfo &CI, 192 const SIInstrInfo &TII, 193 const CombineInfo &Paired); 194 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 195 CombineInfo &Paired, bool Modify = false); 196 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 197 const CombineInfo &Paired); 198 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 199 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 200 const CombineInfo &Paired); 201 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 202 const CombineInfo &Paired); 203 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const; 204 205 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 206 SmallVectorImpl<MachineInstr *> &InstsToMove); 207 208 unsigned read2Opcode(unsigned EltSize) const; 209 unsigned read2ST64Opcode(unsigned EltSize) const; 210 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 211 CombineInfo &Paired, 212 const SmallVectorImpl<MachineInstr *> &InstsToMove); 213 214 unsigned write2Opcode(unsigned EltSize) const; 215 unsigned write2ST64Opcode(unsigned EltSize) const; 216 MachineBasicBlock::iterator 217 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 218 const SmallVectorImpl<MachineInstr *> &InstsToMove); 219 MachineBasicBlock::iterator 220 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 221 const SmallVectorImpl<MachineInstr *> &InstsToMove); 222 MachineBasicBlock::iterator 223 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 224 const SmallVectorImpl<MachineInstr *> &InstsToMove); 225 MachineBasicBlock::iterator 226 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 227 const SmallVectorImpl<MachineInstr *> &InstsToMove); 228 MachineBasicBlock::iterator 229 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 230 const SmallVectorImpl<MachineInstr *> &InstsToMove); 231 MachineBasicBlock::iterator 232 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 233 const SmallVectorImpl<MachineInstr *> &InstsToMove); 234 MachineBasicBlock::iterator 235 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 236 const SmallVectorImpl<MachineInstr *> &InstsToMove); 237 238 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 239 int32_t NewOffset) const; 240 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 241 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 242 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 243 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 244 /// Promotes constant offset to the immediate by adjusting the base. 
It 245 /// tries to use a base from the nearby instructions that allows it to have 246 /// a 13bit constant offset which gets promoted to the immediate. 247 bool promoteConstantOffsetToImm(MachineInstr &CI, 248 MemInfoMap &Visited, 249 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 250 void addInstToMergeableList(const CombineInfo &CI, 251 std::list<std::list<CombineInfo> > &MergeableInsts) const; 252 253 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 254 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 255 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 256 std::list<std::list<CombineInfo>> &MergeableInsts) const; 257 258 public: 259 static char ID; 260 261 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 262 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 263 } 264 265 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 266 bool &OptimizeListAgain); 267 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 268 269 bool runOnMachineFunction(MachineFunction &MF) override; 270 271 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 272 273 void getAnalysisUsage(AnalysisUsage &AU) const override { 274 AU.setPreservesCFG(); 275 AU.addRequired<AAResultsWrapperPass>(); 276 277 MachineFunctionPass::getAnalysisUsage(AU); 278 } 279 280 MachineFunctionProperties getRequiredProperties() const override { 281 return MachineFunctionProperties() 282 .set(MachineFunctionProperties::Property::IsSSA); 283 } 284 }; 285 286 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 287 const unsigned Opc = MI.getOpcode(); 288 289 if (TII.isMUBUF(Opc)) { 290 // FIXME: Handle d16 correctly 291 return AMDGPU::getMUBUFElements(Opc); 292 } 293 if (TII.isMIMG(MI)) { 294 uint64_t DMaskImm = 295 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 296 return countPopulation(DMaskImm); 297 } 298 if (TII.isMTBUF(Opc)) { 299 return AMDGPU::getMTBUFElements(Opc); 300 } 301 302 switch (Opc) { 303 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 304 return 1; 305 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 306 return 2; 307 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 308 return 4; 309 case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH; 310 case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH; 311 case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH; 312 case AMDGPU::DS_WRITE_B32_gfx9: 313 return 1; 314 case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH; 315 case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH; 316 case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH; 317 case AMDGPU::DS_WRITE_B64_gfx9: 318 return 2; 319 default: 320 return 0; 321 } 322 } 323 324 /// Maps instruction opcode to enum InstClassEnum. 325 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 326 switch (Opc) { 327 default: 328 if (TII.isMUBUF(Opc)) { 329 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 330 default: 331 return UNKNOWN; 332 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 333 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 334 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 335 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 336 return BUFFER_LOAD; 337 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 338 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 339 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 340 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 341 return BUFFER_STORE; 342 } 343 } 344 if (TII.isMIMG(Opc)) { 345 // Ignore instructions encoded without vaddr. 
346 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 347 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 348 return UNKNOWN; 349 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 350 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 351 TII.isGather4(Opc)) 352 return UNKNOWN; 353 return MIMG; 354 } 355 if (TII.isMTBUF(Opc)) { 356 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 357 default: 358 return UNKNOWN; 359 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 360 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 363 return TBUFFER_LOAD; 364 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 365 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 368 return TBUFFER_STORE; 369 } 370 } 371 return UNKNOWN; 372 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 374 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 375 return S_BUFFER_LOAD_IMM; 376 case AMDGPU::DS_READ_B32: 377 case AMDGPU::DS_READ_B32_gfx9: 378 case AMDGPU::DS_READ_B64: 379 case AMDGPU::DS_READ_B64_gfx9: 380 return DS_READ; 381 case AMDGPU::DS_WRITE_B32: 382 case AMDGPU::DS_WRITE_B32_gfx9: 383 case AMDGPU::DS_WRITE_B64: 384 case AMDGPU::DS_WRITE_B64_gfx9: 385 return DS_WRITE; 386 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa: 387 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa: 388 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa: 389 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa: 390 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa: 391 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa: 392 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa: 393 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa: 394 return UNKNOWN; 395 } 396 } 397 398 /// Determines instruction subclass from opcode. Only instructions 399 /// of the same subclass can be merged together. 
400 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 401 switch (Opc) { 402 default: 403 if (TII.isMUBUF(Opc)) 404 return AMDGPU::getMUBUFBaseOpcode(Opc); 405 if (TII.isMIMG(Opc)) { 406 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 407 assert(Info); 408 return Info->BaseOpcode; 409 } 410 if (TII.isMTBUF(Opc)) 411 return AMDGPU::getMTBUFBaseOpcode(Opc); 412 return -1; 413 case AMDGPU::DS_READ_B32: 414 case AMDGPU::DS_READ_B32_gfx9: 415 case AMDGPU::DS_READ_B64: 416 case AMDGPU::DS_READ_B64_gfx9: 417 case AMDGPU::DS_WRITE_B32: 418 case AMDGPU::DS_WRITE_B32_gfx9: 419 case AMDGPU::DS_WRITE_B64: 420 case AMDGPU::DS_WRITE_B64_gfx9: 421 return Opc; 422 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 423 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 424 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 425 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 426 } 427 } 428 429 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 430 AddressRegs Result; 431 432 if (TII.isMUBUF(Opc)) { 433 if (AMDGPU::getMUBUFHasVAddr(Opc)) 434 Result.VAddr = true; 435 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 436 Result.SRsrc = true; 437 if (AMDGPU::getMUBUFHasSoffset(Opc)) 438 Result.SOffset = true; 439 440 return Result; 441 } 442 443 if (TII.isMIMG(Opc)) { 444 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 445 if (VAddr0Idx >= 0) { 446 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 447 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 448 } else { 449 Result.VAddr = true; 450 } 451 Result.SRsrc = true; 452 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 453 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 454 Result.SSamp = true; 455 456 return Result; 457 } 458 if (TII.isMTBUF(Opc)) { 459 if (AMDGPU::getMTBUFHasVAddr(Opc)) 460 Result.VAddr = true; 461 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 462 Result.SRsrc = true; 463 if (AMDGPU::getMTBUFHasSoffset(Opc)) 464 Result.SOffset = true; 465 466 return Result; 467 } 468 469 switch (Opc) { 470 default: 471 return Result; 472 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 473 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 474 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 475 Result.SBase = true; 476 return Result; 477 case AMDGPU::DS_READ_B32: 478 case AMDGPU::DS_READ_B64: 479 case AMDGPU::DS_READ_B32_gfx9: 480 case AMDGPU::DS_READ_B64_gfx9: 481 case AMDGPU::DS_WRITE_B32: 482 case AMDGPU::DS_WRITE_B64: 483 case AMDGPU::DS_WRITE_B32_gfx9: 484 case AMDGPU::DS_WRITE_B64_gfx9: 485 Result.Addr = true; 486 return Result; 487 } 488 } 489 490 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 491 const SIInstrInfo &TII, 492 const GCNSubtarget &STM) { 493 I = MI; 494 unsigned Opc = MI->getOpcode(); 495 InstClass = getInstClass(Opc, TII); 496 497 if (InstClass == UNKNOWN) 498 return; 499 500 switch (InstClass) { 501 case DS_READ: 502 EltSize = 503 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 504 : 4; 505 break; 506 case DS_WRITE: 507 EltSize = 508 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 509 : 4; 510 break; 511 case S_BUFFER_LOAD_IMM: 512 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 513 break; 514 default: 515 EltSize = 4; 516 break; 517 } 518 519 if (InstClass == MIMG) { 520 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 521 // Offset is not considered for MIMG instructions. 
522 Offset = 0; 523 } else { 524 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 525 Offset = I->getOperand(OffsetIdx).getImm(); 526 } 527 528 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 529 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 530 531 Width = getOpcodeWidth(*I, TII); 532 533 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 534 Offset &= 0xffff; 535 } else if (InstClass != MIMG) { 536 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 537 if (InstClass != S_BUFFER_LOAD_IMM) { 538 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 539 } 540 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 541 if (InstClass != S_BUFFER_LOAD_IMM) { 542 SCCB = TII.getNamedOperand(*I, AMDGPU::OpName::sccb)->getImm(); 543 } 544 } 545 546 AddressRegs Regs = getRegs(Opc, TII); 547 548 NumAddresses = 0; 549 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 550 AddrIdx[NumAddresses++] = 551 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 552 if (Regs.Addr) 553 AddrIdx[NumAddresses++] = 554 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 555 if (Regs.SBase) 556 AddrIdx[NumAddresses++] = 557 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 558 if (Regs.SRsrc) 559 AddrIdx[NumAddresses++] = 560 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 561 if (Regs.SOffset) 562 AddrIdx[NumAddresses++] = 563 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 564 if (Regs.VAddr) 565 AddrIdx[NumAddresses++] = 566 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 567 if (Regs.SSamp) 568 AddrIdx[NumAddresses++] = 569 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 570 assert(NumAddresses <= MaxAddressRegs); 571 572 for (unsigned J = 0; J < NumAddresses; J++) 573 AddrReg[J] = &I->getOperand(AddrIdx[J]); 574 } 575 576 } // end anonymous namespace. 577 578 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 579 "SI Load Store Optimizer", false, false) 580 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 581 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 582 false, false) 583 584 char SILoadStoreOptimizer::ID = 0; 585 586 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 587 588 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 589 return new SILoadStoreOptimizer(); 590 } 591 592 static void moveInstsAfter(MachineBasicBlock::iterator I, 593 ArrayRef<MachineInstr *> InstsToMove) { 594 MachineBasicBlock *MBB = I->getParent(); 595 ++I; 596 for (MachineInstr *MI : InstsToMove) { 597 MI->removeFromParent(); 598 MBB->insert(I, MI); 599 } 600 } 601 602 static void addDefsUsesToList(const MachineInstr &MI, 603 DenseSet<Register> &RegDefs, 604 DenseSet<Register> &PhysRegUses) { 605 for (const MachineOperand &Op : MI.operands()) { 606 if (Op.isReg()) { 607 if (Op.isDef()) 608 RegDefs.insert(Op.getReg()); 609 else if (Op.readsReg() && Op.getReg().isPhysical()) 610 PhysRegUses.insert(Op.getReg()); 611 } 612 } 613 } 614 615 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, 616 MachineBasicBlock::iterator B, 617 AliasAnalysis *AA) { 618 // RAW or WAR - cannot reorder 619 // WAW - cannot reorder 620 // RAR - safe to reorder 621 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); 622 } 623 624 // Add MI and its defs to the lists if MI reads one of the defs that are 625 // already in the list. Returns true in that case. 
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && Use.getReg().isPhysical() &&
                         PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and that they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
                                AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
                                AMDGPU::OpName::da,  AMDGPU::OpName::r128,
                                AMDGPU::OpName::a16, AMDGPU::OpName::dlc};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
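  // For example, dmasks 0x3 and 0xc can be combined (every bit of the smaller
  // mask lies below the lowest set bit of the larger one), while 0x3 and 0x6
  // overlap and cannot be combined.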
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM ||
            (CI.SLC == Paired.SLC && CI.SCCB == Paired.SCCB));
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
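  // For example, dword element offsets 0 and 6400 are both multiples of 64 and
  // become ST64 offsets 0 and 100, which fit in the 8-bit offset fields.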
808 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 809 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 810 if (Modify) { 811 CI.Offset = EltOffset0 / 64; 812 Paired.Offset = EltOffset1 / 64; 813 CI.UseST64 = true; 814 } 815 return true; 816 } 817 818 // Check if the new offsets fit in the reduced 8-bit range. 819 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 820 if (Modify) { 821 CI.Offset = EltOffset0; 822 Paired.Offset = EltOffset1; 823 } 824 return true; 825 } 826 827 // Try to shift base address to decrease offsets. 828 uint32_t Min = std::min(EltOffset0, EltOffset1); 829 uint32_t Max = std::max(EltOffset0, EltOffset1); 830 831 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 832 if (((Max - Min) & ~Mask) == 0) { 833 if (Modify) { 834 // From the range of values we could use for BaseOff, choose the one that 835 // is aligned to the highest power of two, to maximise the chance that 836 // the same offset can be reused for other load/store pairs. 837 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 838 // Copy the low bits of the offsets, so that when we adjust them by 839 // subtracting BaseOff they will be multiples of 64. 840 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 841 CI.BaseOff = BaseOff * CI.EltSize; 842 CI.Offset = (EltOffset0 - BaseOff) / 64; 843 Paired.Offset = (EltOffset1 - BaseOff) / 64; 844 CI.UseST64 = true; 845 } 846 return true; 847 } 848 849 if (isUInt<8>(Max - Min)) { 850 if (Modify) { 851 // From the range of values we could use for BaseOff, choose the one that 852 // is aligned to the highest power of two, to maximise the chance that 853 // the same offset can be reused for other load/store pairs. 854 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 855 CI.BaseOff = BaseOff * CI.EltSize; 856 CI.Offset = EltOffset0 - BaseOff; 857 Paired.Offset = EltOffset1 - BaseOff; 858 } 859 return true; 860 } 861 862 return false; 863 } 864 865 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 866 const CombineInfo &CI, 867 const CombineInfo &Paired) { 868 const unsigned Width = (CI.Width + Paired.Width); 869 switch (CI.InstClass) { 870 default: 871 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 872 case S_BUFFER_LOAD_IMM: 873 switch (Width) { 874 default: 875 return false; 876 case 2: 877 case 4: 878 return true; 879 } 880 } 881 } 882 883 const TargetRegisterClass * 884 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const { 885 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { 886 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 887 } 888 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) { 889 return TRI->getRegClassForReg(*MRI, Src->getReg()); 890 } 891 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) { 892 return TRI->getRegClassForReg(*MRI, Src->getReg()); 893 } 894 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) { 895 return TRI->getRegClassForReg(*MRI, Dst->getReg()); 896 } 897 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) { 898 return TRI->getRegClassForReg(*MRI, Src->getReg()); 899 } 900 return nullptr; 901 } 902 903 /// This function assumes that CI comes before Paired in a basic block. 904 bool SILoadStoreOptimizer::checkAndPrepareMerge( 905 CombineInfo &CI, CombineInfo &Paired, 906 SmallVectorImpl<MachineInstr *> &InstsToMove) { 907 908 // Check both offsets (or masks for MIMG) can be combined and fit in the 909 // reduced range. 
  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    return false;

  if (CI.InstClass != MIMG &&
      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
    return false;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  DenseSet<Register> RegDefsToMove;
  DenseSet<Register> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
  bool IsAGPR = TRI->hasAGPRs(DataRC);

  MachineBasicBlock::iterator E = std::next(Paired.I);
  MachineBasicBlock::iterator MBBI = std::next(CI.I);
  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (; MBBI != E; ++MBBI) {

    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order in a situation when it
      // moves an instruction which exists in some other merge list.
      // In this case it must be dependent.
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              InstsToMove))
      continue;

    if (&*MBBI == &*Paired.I) {
      if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
        return false;
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with VGPR and AGPR sources.
      //        Consequently, if we create such an instruction, the verifier
      //        will complain.
      if (IsAGPR && CI.InstClass == DS_WRITE)
        return false;

      // We need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {

        // Call offsetsCanBeCombined with modify = true so that the offsets are
        // correct for the new instruction. This should return true, because
        // this function should only be called on CombineInfo objects that
        // have already been confirmed to be mergeable.
        if (CI.InstClass != MIMG)
          offsetsCanBeCombined(CI, *STM, Paired, true);
        return true;
      }
      return false;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all of I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
1081 std::swap(NewOffset0, NewOffset1); 1082 std::swap(SubRegIdx0, SubRegIdx1); 1083 } 1084 1085 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1086 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1087 1088 const MCInstrDesc &Read2Desc = TII->get(Opc); 1089 1090 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1091 Register DestReg = MRI->createVirtualRegister(SuperRC); 1092 1093 DebugLoc DL = CI.I->getDebugLoc(); 1094 1095 Register BaseReg = AddrReg->getReg(); 1096 unsigned BaseSubReg = AddrReg->getSubReg(); 1097 unsigned BaseRegFlags = 0; 1098 if (CI.BaseOff) { 1099 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1100 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1101 .addImm(CI.BaseOff); 1102 1103 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1104 BaseRegFlags = RegState::Kill; 1105 1106 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1107 .addReg(ImmReg) 1108 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1109 .addImm(0); // clamp bit 1110 BaseSubReg = 0; 1111 } 1112 1113 MachineInstrBuilder Read2 = 1114 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1115 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1116 .addImm(NewOffset0) // offset0 1117 .addImm(NewOffset1) // offset1 1118 .addImm(0) // gds 1119 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1120 1121 (void)Read2; 1122 1123 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1124 1125 // Copy to the old destination registers. 1126 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1127 .add(*Dest0) // Copy to same destination including flags and sub reg. 1128 .addReg(DestReg, 0, SubRegIdx0); 1129 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1130 .add(*Dest1) 1131 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1132 1133 moveInstsAfter(Copy1, InstsToMove); 1134 1135 CI.I->eraseFromParent(); 1136 Paired.I->eraseFromParent(); 1137 1138 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1139 return Read2; 1140 } 1141 1142 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1143 if (STM->ldsRequiresM0Init()) 1144 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1145 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1146 : AMDGPU::DS_WRITE2_B64_gfx9; 1147 } 1148 1149 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1150 if (STM->ldsRequiresM0Init()) 1151 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1152 : AMDGPU::DS_WRITE2ST64_B64; 1153 1154 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1155 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1156 } 1157 1158 MachineBasicBlock::iterator 1159 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1160 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1161 MachineBasicBlock *MBB = CI.I->getParent(); 1162 1163 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1164 // sure we preserve the subregister index and any register flags set on them. 1165 const MachineOperand *AddrReg = 1166 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1167 const MachineOperand *Data0 = 1168 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1169 const MachineOperand *Data1 = 1170 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1171 1172 unsigned NewOffset0 = CI.Offset; 1173 unsigned NewOffset1 = Paired.Offset; 1174 unsigned Opc = 1175 CI.UseST64 ? 
write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1176 1177 if (NewOffset0 > NewOffset1) { 1178 // Canonicalize the merged instruction so the smaller offset comes first. 1179 std::swap(NewOffset0, NewOffset1); 1180 std::swap(Data0, Data1); 1181 } 1182 1183 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1184 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1185 1186 const MCInstrDesc &Write2Desc = TII->get(Opc); 1187 DebugLoc DL = CI.I->getDebugLoc(); 1188 1189 Register BaseReg = AddrReg->getReg(); 1190 unsigned BaseSubReg = AddrReg->getSubReg(); 1191 unsigned BaseRegFlags = 0; 1192 if (CI.BaseOff) { 1193 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1194 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1195 .addImm(CI.BaseOff); 1196 1197 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1198 BaseRegFlags = RegState::Kill; 1199 1200 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1201 .addReg(ImmReg) 1202 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1203 .addImm(0); // clamp bit 1204 BaseSubReg = 0; 1205 } 1206 1207 MachineInstrBuilder Write2 = 1208 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1209 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1210 .add(*Data0) // data0 1211 .add(*Data1) // data1 1212 .addImm(NewOffset0) // offset0 1213 .addImm(NewOffset1) // offset1 1214 .addImm(0) // gds 1215 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1216 1217 moveInstsAfter(Write2, InstsToMove); 1218 1219 CI.I->eraseFromParent(); 1220 Paired.I->eraseFromParent(); 1221 1222 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1223 return Write2; 1224 } 1225 1226 MachineBasicBlock::iterator 1227 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1228 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1229 MachineBasicBlock *MBB = CI.I->getParent(); 1230 DebugLoc DL = CI.I->getDebugLoc(); 1231 const unsigned Opcode = getNewOpcode(CI, Paired); 1232 1233 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1234 1235 Register DestReg = MRI->createVirtualRegister(SuperRC); 1236 unsigned MergedDMask = CI.DMask | Paired.DMask; 1237 unsigned DMaskIdx = 1238 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1239 1240 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1241 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1242 if (I == DMaskIdx) 1243 MIB.addImm(MergedDMask); 1244 else 1245 MIB.add((*CI.I).getOperand(I)); 1246 } 1247 1248 // It shouldn't be possible to get this far if the two instructions 1249 // don't have a single memoperand, because MachineInstr::mayAlias() 1250 // will return true if this is the case. 1251 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1252 1253 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1254 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1255 1256 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1257 1258 unsigned SubRegIdx0, SubRegIdx1; 1259 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1260 1261 // Copy to the old destination registers. 
1262 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1263 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1264 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1265 1266 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1267 .add(*Dest0) // Copy to same destination including flags and sub reg. 1268 .addReg(DestReg, 0, SubRegIdx0); 1269 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1270 .add(*Dest1) 1271 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1272 1273 moveInstsAfter(Copy1, InstsToMove); 1274 1275 CI.I->eraseFromParent(); 1276 Paired.I->eraseFromParent(); 1277 return New; 1278 } 1279 1280 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1281 CombineInfo &CI, CombineInfo &Paired, 1282 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1283 MachineBasicBlock *MBB = CI.I->getParent(); 1284 DebugLoc DL = CI.I->getDebugLoc(); 1285 const unsigned Opcode = getNewOpcode(CI, Paired); 1286 1287 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1288 1289 Register DestReg = MRI->createVirtualRegister(SuperRC); 1290 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1291 1292 // It shouldn't be possible to get this far if the two instructions 1293 // don't have a single memoperand, because MachineInstr::mayAlias() 1294 // will return true if this is the case. 1295 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1296 1297 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1298 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1299 1300 MachineInstr *New = 1301 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1302 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1303 .addImm(MergedOffset) // offset 1304 .addImm(CI.GLC) // glc 1305 .addImm(CI.DLC) // dlc 1306 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1307 1308 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1309 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1310 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1311 1312 // Copy to the old destination registers. 1313 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1314 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1315 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1316 1317 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1318 .add(*Dest0) // Copy to same destination including flags and sub reg. 1319 .addReg(DestReg, 0, SubRegIdx0); 1320 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1321 .add(*Dest1) 1322 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1323 1324 moveInstsAfter(Copy1, InstsToMove); 1325 1326 CI.I->eraseFromParent(); 1327 Paired.I->eraseFromParent(); 1328 return New; 1329 } 1330 1331 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1332 CombineInfo &CI, CombineInfo &Paired, 1333 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1334 MachineBasicBlock *MBB = CI.I->getParent(); 1335 DebugLoc DL = CI.I->getDebugLoc(); 1336 1337 const unsigned Opcode = getNewOpcode(CI, Paired); 1338 1339 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1340 1341 // Copy to the new source register. 
1342 Register DestReg = MRI->createVirtualRegister(SuperRC); 1343 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1344 1345 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1346 1347 AddressRegs Regs = getRegs(Opcode, *TII); 1348 1349 if (Regs.VAddr) 1350 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1351 1352 // It shouldn't be possible to get this far if the two instructions 1353 // don't have a single memoperand, because MachineInstr::mayAlias() 1354 // will return true if this is the case. 1355 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1356 1357 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1358 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1359 1360 MachineInstr *New = 1361 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1362 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1363 .addImm(MergedOffset) // offset 1364 .addImm(CI.GLC) // glc 1365 .addImm(CI.SLC) // slc 1366 .addImm(0) // tfe 1367 .addImm(CI.DLC) // dlc 1368 .addImm(0) // swz 1369 .addImm(CI.SCCB) // scc 1370 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1371 1372 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1373 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1374 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1375 1376 // Copy to the old destination registers. 1377 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1378 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1379 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1380 1381 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1382 .add(*Dest0) // Copy to same destination including flags and sub reg. 1383 .addReg(DestReg, 0, SubRegIdx0); 1384 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1385 .add(*Dest1) 1386 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1387 1388 moveInstsAfter(Copy1, InstsToMove); 1389 1390 CI.I->eraseFromParent(); 1391 Paired.I->eraseFromParent(); 1392 return New; 1393 } 1394 1395 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1396 CombineInfo &CI, CombineInfo &Paired, 1397 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1398 MachineBasicBlock *MBB = CI.I->getParent(); 1399 DebugLoc DL = CI.I->getDebugLoc(); 1400 1401 const unsigned Opcode = getNewOpcode(CI, Paired); 1402 1403 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1404 1405 // Copy to the new source register. 1406 Register DestReg = MRI->createVirtualRegister(SuperRC); 1407 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1408 1409 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1410 1411 AddressRegs Regs = getRegs(Opcode, *TII); 1412 1413 if (Regs.VAddr) 1414 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1415 1416 unsigned JoinedFormat = 1417 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1418 1419 // It shouldn't be possible to get this far if the two instructions 1420 // don't have a single memoperand, because MachineInstr::mayAlias() 1421 // will return true if this is the case. 
1422 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1423 1424 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1425 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1426 1427 MachineInstr *New = 1428 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1429 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1430 .addImm(MergedOffset) // offset 1431 .addImm(JoinedFormat) // format 1432 .addImm(CI.GLC) // glc 1433 .addImm(CI.SLC) // slc 1434 .addImm(0) // tfe 1435 .addImm(CI.DLC) // dlc 1436 .addImm(0) // swz 1437 .addImm(CI.SCCB) // scc 1438 .addMemOperand( 1439 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1440 1441 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1442 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1443 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1444 1445 // Copy to the old destination registers. 1446 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1447 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1448 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1449 1450 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1451 .add(*Dest0) // Copy to same destination including flags and sub reg. 1452 .addReg(DestReg, 0, SubRegIdx0); 1453 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1454 .add(*Dest1) 1455 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1456 1457 moveInstsAfter(Copy1, InstsToMove); 1458 1459 CI.I->eraseFromParent(); 1460 Paired.I->eraseFromParent(); 1461 return New; 1462 } 1463 1464 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1465 CombineInfo &CI, CombineInfo &Paired, 1466 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1467 MachineBasicBlock *MBB = CI.I->getParent(); 1468 DebugLoc DL = CI.I->getDebugLoc(); 1469 1470 const unsigned Opcode = getNewOpcode(CI, Paired); 1471 1472 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1473 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1474 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1475 1476 // Copy to the new source register. 1477 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1478 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1479 1480 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1481 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1482 1483 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1484 .add(*Src0) 1485 .addImm(SubRegIdx0) 1486 .add(*Src1) 1487 .addImm(SubRegIdx1); 1488 1489 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1490 .addReg(SrcReg, RegState::Kill); 1491 1492 AddressRegs Regs = getRegs(Opcode, *TII); 1493 1494 if (Regs.VAddr) 1495 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1496 1497 unsigned JoinedFormat = 1498 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1499 1500 // It shouldn't be possible to get this far if the two instructions 1501 // don't have a single memoperand, because MachineInstr::mayAlias() 1502 // will return true if this is the case. 
1503 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1504 1505 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1506 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1507 1508 MachineInstr *New = 1509 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1510 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1511 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1512 .addImm(JoinedFormat) // format 1513 .addImm(CI.GLC) // glc 1514 .addImm(CI.SLC) // slc 1515 .addImm(0) // tfe 1516 .addImm(CI.DLC) // dlc 1517 .addImm(0) // swz 1518 .addImm(CI.SCCB) // scc 1519 .addMemOperand( 1520 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1521 1522 moveInstsAfter(MIB, InstsToMove); 1523 1524 CI.I->eraseFromParent(); 1525 Paired.I->eraseFromParent(); 1526 return New; 1527 } 1528 1529 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1530 const CombineInfo &Paired) { 1531 const unsigned Width = CI.Width + Paired.Width; 1532 1533 switch (CI.InstClass) { 1534 default: 1535 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1536 // FIXME: Handle d16 correctly 1537 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1538 Width); 1539 case TBUFFER_LOAD: 1540 case TBUFFER_STORE: 1541 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1542 Width); 1543 1544 case UNKNOWN: 1545 llvm_unreachable("Unknown instruction class"); 1546 case S_BUFFER_LOAD_IMM: 1547 switch (Width) { 1548 default: 1549 return 0; 1550 case 2: 1551 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1552 case 4: 1553 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1554 } 1555 case MIMG: 1556 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1557 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1558 } 1559 } 1560 1561 std::pair<unsigned, unsigned> 1562 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1563 1564 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1565 return std::make_pair(0, 0); 1566 1567 bool ReverseOrder; 1568 if (CI.InstClass == MIMG) { 1569 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1570 "No overlaps"); 1571 ReverseOrder = CI.DMask > Paired.DMask; 1572 } else 1573 ReverseOrder = CI.Offset > Paired.Offset; 1574 1575 static const unsigned Idxs[4][4] = { 1576 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1577 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1578 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1579 {AMDGPU::sub3, 0, 0, 0}, 1580 }; 1581 unsigned Idx0; 1582 unsigned Idx1; 1583 1584 assert(CI.Width >= 1 && CI.Width <= 3); 1585 assert(Paired.Width >= 1 && Paired.Width <= 3); 1586 1587 if (ReverseOrder) { 1588 Idx1 = Idxs[0][Paired.Width - 1]; 1589 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1590 } else { 1591 Idx0 = Idxs[0][CI.Width - 1]; 1592 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1593 } 1594 1595 return std::make_pair(Idx0, Idx1); 1596 } 1597 1598 const TargetRegisterClass * 1599 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1600 const CombineInfo &Paired) { 1601 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1602 switch (CI.Width + Paired.Width) { 1603 default: 1604 return nullptr; 1605 case 2: 1606 return &AMDGPU::SReg_64_XEXECRegClass; 1607 case 4: 1608 return &AMDGPU::SGPR_128RegClass; 1609 case 8: 1610 return &AMDGPU::SGPR_256RegClass; 1611 case 16: 1612 return 
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->hasAGPRs(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.GLC)                             // glc
          .addImm(CI.SLC)                             // slc
          .addImm(0)                                  // tfe
          .addImm(CI.DLC)                             // dlc
          .addImm(0)                                  // swz
          .addImm(CI.SCCB)                            // scc
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
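// As an illustrative sketch (register names below are not from the source),
// for Addr = {Base = {%lo, %hi}, Offset = C} the sequence emitted by
// computeBase is roughly:
//   %d0:vgpr_32, %carry:sreg_1_xexec = V_ADD_CO_U32_e64 %lo, lo32(C), 0
//   %d1:vgpr_32, %dead:sreg_1_xexec  = V_ADDC_U32_e64 %hi, hi32(C), %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %d0, %subreg.sub0, %d1, %subreg.sub1
// where lo32(C)/hi32(C) are either inline constants or S_MOV_B32 results
// produced by createRegOrImm, and %newbase is the returned register.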
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
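// For example (the offsets are illustrative), if the anchor base has been
// recomputed as %newbase = &a + 8192, a load that previously addressed
// &a + 4096 with offset:0 is rewritten in place to use %newbase with
// offset:-4096; see the Step 3 example in promoteConstantOffsetToImm below.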
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
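  // Only GLOBAL_* opcodes that have a SADDR (scalar base address) form are
  // considered here; AMDGPU::getGlobalSaddrOp returns a negative value for
  // everything else, so other accesses are rejected below.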
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the highest
  // 13-bit-reachable distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,  0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
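    // An anchor candidate must use the same opcode as MI and must not already
    // carry a non-zero immediate offset.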
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    // We also won't be able to merge across this, so break the search. We can
    // look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}