1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass tries to fuse DS instructions with close by immediate offsets. 10 // This will fuse operations such as 11 // ds_read_b32 v0, v2 offset:16 12 // ds_read_b32 v1, v2 offset:32 13 // ==> 14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 15 // 16 // The same is done for certain SMEM and VMEM opcodes, e.g.: 17 // s_buffer_load_dword s4, s[0:3], 4 18 // s_buffer_load_dword s5, s[0:3], 8 19 // ==> 20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 21 // 22 // This pass also tries to promote constant offset to the immediate by 23 // adjusting the base. It tries to use a base from the nearby instructions that 24 // allows it to have a 13bit constant offset and then promotes the 13bit offset 25 // to the immediate. 26 // E.g. 27 // s_movk_i32 s0, 0x1800 28 // v_add_co_u32_e32 v0, vcc, s0, v2 29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc 30 // 31 // s_movk_i32 s0, 0x1000 32 // v_add_co_u32_e32 v5, vcc, s0, v2 33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 34 // global_load_dwordx2 v[5:6], v[5:6], off 35 // global_load_dwordx2 v[0:1], v[0:1], off 36 // => 37 // s_movk_i32 s0, 0x1000 38 // v_add_co_u32_e32 v5, vcc, s0, v2 39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 40 // global_load_dwordx2 v[5:6], v[5:6], off 41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 42 // 43 // Future improvements: 44 // 45 // - This is currently missing stores of constants because loading 46 // the constant into the data register is placed between the stores, although 47 // this is arguably a scheduling problem. 48 // 49 // - Live interval recomputing seems inefficient. This currently only matches 50 // one pair, and recomputes live intervals and moves on to the next pair. It 51 // would be better to compute a list of all merges that need to occur. 52 // 53 // - With a list of instructions to process, we can also merge more. If a 54 // cluster of loads have offsets that are too large to fit in the 8-bit 55 // offsets, but are close enough to fit in the 8 bits, we can add to the base 56 // pointer and use the new reduced offsets. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #include "AMDGPU.h" 61 #include "GCNSubtarget.h" 62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 63 #include "llvm/Analysis/AliasAnalysis.h" 64 #include "llvm/CodeGen/MachineFunctionPass.h" 65 #include "llvm/InitializePasses.h" 66 67 using namespace llvm; 68 69 #define DEBUG_TYPE "si-load-store-opt" 70 71 namespace { 72 enum InstClassEnum { 73 UNKNOWN, 74 DS_READ, 75 DS_WRITE, 76 S_BUFFER_LOAD_IMM, 77 BUFFER_LOAD, 78 BUFFER_STORE, 79 MIMG, 80 TBUFFER_LOAD, 81 TBUFFER_STORE, 82 }; 83 84 struct AddressRegs { 85 unsigned char NumVAddrs = 0; 86 bool SBase = false; 87 bool SRsrc = false; 88 bool SOffset = false; 89 bool VAddr = false; 90 bool Addr = false; 91 bool SSamp = false; 92 }; 93 94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 
95 const unsigned MaxAddressRegs = 12 + 1 + 1; 96 97 class SILoadStoreOptimizer : public MachineFunctionPass { 98 struct CombineInfo { 99 MachineBasicBlock::iterator I; 100 unsigned EltSize; 101 unsigned Offset; 102 unsigned Width; 103 unsigned Format; 104 unsigned BaseOff; 105 unsigned DMask; 106 InstClassEnum InstClass; 107 bool GLC; 108 bool SLC; 109 bool DLC; 110 bool UseST64; 111 int AddrIdx[MaxAddressRegs]; 112 const MachineOperand *AddrReg[MaxAddressRegs]; 113 unsigned NumAddresses; 114 unsigned Order; 115 116 bool hasSameBaseAddress(const MachineInstr &MI) { 117 for (unsigned i = 0; i < NumAddresses; i++) { 118 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]); 119 120 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) { 121 if (AddrReg[i]->isImm() != AddrRegNext.isImm() || 122 AddrReg[i]->getImm() != AddrRegNext.getImm()) { 123 return false; 124 } 125 continue; 126 } 127 128 // Check same base pointer. Be careful of subregisters, which can occur 129 // with vectors of pointers. 130 if (AddrReg[i]->getReg() != AddrRegNext.getReg() || 131 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) { 132 return false; 133 } 134 } 135 return true; 136 } 137 138 bool hasMergeableAddress(const MachineRegisterInfo &MRI) { 139 for (unsigned i = 0; i < NumAddresses; ++i) { 140 const MachineOperand *AddrOp = AddrReg[i]; 141 // Immediates are always OK. 142 if (AddrOp->isImm()) 143 continue; 144 145 // Don't try to merge addresses that aren't either immediates or registers. 146 // TODO: Should be possible to merge FrameIndexes and maybe some other 147 // non-register 148 if (!AddrOp->isReg()) 149 return false; 150 151 // TODO: We should be able to merge physical reg addreses. 152 if (AddrOp->getReg().isPhysical()) 153 return false; 154 155 // If an address has only one use then there will be on other 156 // instructions with the same address, so we can't merge this one. 
157 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 158 return false; 159 } 160 return true; 161 } 162 163 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 164 const GCNSubtarget &STM); 165 }; 166 167 struct BaseRegisters { 168 Register LoReg; 169 Register HiReg; 170 171 unsigned LoSubReg = 0; 172 unsigned HiSubReg = 0; 173 }; 174 175 struct MemAddress { 176 BaseRegisters Base; 177 int64_t Offset = 0; 178 }; 179 180 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 181 182 private: 183 const GCNSubtarget *STM = nullptr; 184 const SIInstrInfo *TII = nullptr; 185 const SIRegisterInfo *TRI = nullptr; 186 MachineRegisterInfo *MRI = nullptr; 187 AliasAnalysis *AA = nullptr; 188 bool OptimizeAgain; 189 190 static bool dmasksCanBeCombined(const CombineInfo &CI, 191 const SIInstrInfo &TII, 192 const CombineInfo &Paired); 193 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 194 CombineInfo &Paired, bool Modify = false); 195 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 196 const CombineInfo &Paired); 197 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 198 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 199 const CombineInfo &Paired); 200 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 201 const CombineInfo &Paired); 202 203 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 204 SmallVectorImpl<MachineInstr *> &InstsToMove); 205 206 unsigned read2Opcode(unsigned EltSize) const; 207 unsigned read2ST64Opcode(unsigned EltSize) const; 208 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 209 CombineInfo &Paired, 210 const SmallVectorImpl<MachineInstr *> &InstsToMove); 211 212 unsigned write2Opcode(unsigned EltSize) const; 213 unsigned write2ST64Opcode(unsigned EltSize) const; 214 MachineBasicBlock::iterator 215 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 216 const SmallVectorImpl<MachineInstr *> &InstsToMove); 217 MachineBasicBlock::iterator 218 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 219 const SmallVectorImpl<MachineInstr *> &InstsToMove); 220 MachineBasicBlock::iterator 221 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 222 const SmallVectorImpl<MachineInstr *> &InstsToMove); 223 MachineBasicBlock::iterator 224 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 225 const SmallVectorImpl<MachineInstr *> &InstsToMove); 226 MachineBasicBlock::iterator 227 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 228 const SmallVectorImpl<MachineInstr *> &InstsToMove); 229 MachineBasicBlock::iterator 230 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 231 const SmallVectorImpl<MachineInstr *> &InstsToMove); 232 MachineBasicBlock::iterator 233 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 234 const SmallVectorImpl<MachineInstr *> &InstsToMove); 235 236 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 237 int32_t NewOffset) const; 238 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 239 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 240 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 241 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 242 /// Promotes constant offset to the immediate by adjusting the base. 
It 243 /// tries to use a base from the nearby instructions that allows it to have 244 /// a 13bit constant offset which gets promoted to the immediate. 245 bool promoteConstantOffsetToImm(MachineInstr &CI, 246 MemInfoMap &Visited, 247 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 248 void addInstToMergeableList(const CombineInfo &CI, 249 std::list<std::list<CombineInfo> > &MergeableInsts) const; 250 251 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 252 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 253 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 254 std::list<std::list<CombineInfo>> &MergeableInsts) const; 255 256 public: 257 static char ID; 258 259 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 260 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 261 } 262 263 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 264 bool &OptimizeListAgain); 265 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 266 267 bool runOnMachineFunction(MachineFunction &MF) override; 268 269 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 270 271 void getAnalysisUsage(AnalysisUsage &AU) const override { 272 AU.setPreservesCFG(); 273 AU.addRequired<AAResultsWrapperPass>(); 274 275 MachineFunctionPass::getAnalysisUsage(AU); 276 } 277 278 MachineFunctionProperties getRequiredProperties() const override { 279 return MachineFunctionProperties() 280 .set(MachineFunctionProperties::Property::IsSSA); 281 } 282 }; 283 284 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 285 const unsigned Opc = MI.getOpcode(); 286 287 if (TII.isMUBUF(Opc)) { 288 // FIXME: Handle d16 correctly 289 return AMDGPU::getMUBUFElements(Opc); 290 } 291 if (TII.isMIMG(MI)) { 292 uint64_t DMaskImm = 293 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 294 return countPopulation(DMaskImm); 295 } 296 if (TII.isMTBUF(Opc)) { 297 return AMDGPU::getMTBUFElements(Opc); 298 } 299 300 switch (Opc) { 301 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 302 return 1; 303 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 304 return 2; 305 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 306 return 4; 307 default: 308 return 0; 309 } 310 } 311 312 /// Maps instruction opcode to enum InstClassEnum. 313 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 314 switch (Opc) { 315 default: 316 if (TII.isMUBUF(Opc)) { 317 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 318 default: 319 return UNKNOWN; 320 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 321 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 322 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 323 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 324 return BUFFER_LOAD; 325 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 326 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 327 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 328 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 329 return BUFFER_STORE; 330 } 331 } 332 if (TII.isMIMG(Opc)) { 333 // Ignore instructions encoded without vaddr. 334 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 335 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 336 return UNKNOWN; 337 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
338 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 339 TII.isGather4(Opc)) 340 return UNKNOWN; 341 return MIMG; 342 } 343 if (TII.isMTBUF(Opc)) { 344 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 345 default: 346 return UNKNOWN; 347 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 348 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 349 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 350 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 351 return TBUFFER_LOAD; 352 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 353 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 354 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 355 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 356 return TBUFFER_STORE; 357 } 358 } 359 return UNKNOWN; 360 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 361 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 362 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 363 return S_BUFFER_LOAD_IMM; 364 case AMDGPU::DS_READ_B32: 365 case AMDGPU::DS_READ_B32_gfx9: 366 case AMDGPU::DS_READ_B64: 367 case AMDGPU::DS_READ_B64_gfx9: 368 return DS_READ; 369 case AMDGPU::DS_WRITE_B32: 370 case AMDGPU::DS_WRITE_B32_gfx9: 371 case AMDGPU::DS_WRITE_B64: 372 case AMDGPU::DS_WRITE_B64_gfx9: 373 return DS_WRITE; 374 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa: 375 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa: 376 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa: 377 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa: 378 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa: 379 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa: 380 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa: 381 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa: 382 return UNKNOWN; 383 } 384 } 385 386 /// Determines instruction subclass from opcode. Only instructions 387 /// of the same subclass can be merged together. 388 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 389 switch (Opc) { 390 default: 391 if (TII.isMUBUF(Opc)) 392 return AMDGPU::getMUBUFBaseOpcode(Opc); 393 if (TII.isMIMG(Opc)) { 394 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 395 assert(Info); 396 return Info->BaseOpcode; 397 } 398 if (TII.isMTBUF(Opc)) 399 return AMDGPU::getMTBUFBaseOpcode(Opc); 400 return -1; 401 case AMDGPU::DS_READ_B32: 402 case AMDGPU::DS_READ_B32_gfx9: 403 case AMDGPU::DS_READ_B64: 404 case AMDGPU::DS_READ_B64_gfx9: 405 case AMDGPU::DS_WRITE_B32: 406 case AMDGPU::DS_WRITE_B32_gfx9: 407 case AMDGPU::DS_WRITE_B64: 408 case AMDGPU::DS_WRITE_B64_gfx9: 409 return Opc; 410 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 411 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 412 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 413 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 414 } 415 } 416 417 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 418 AddressRegs Result; 419 420 if (TII.isMUBUF(Opc)) { 421 if (AMDGPU::getMUBUFHasVAddr(Opc)) 422 Result.VAddr = true; 423 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 424 Result.SRsrc = true; 425 if (AMDGPU::getMUBUFHasSoffset(Opc)) 426 Result.SOffset = true; 427 428 return Result; 429 } 430 431 if (TII.isMIMG(Opc)) { 432 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 433 if (VAddr0Idx >= 0) { 434 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 435 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 436 } else { 437 Result.VAddr = true; 438 } 439 Result.SRsrc = true; 440 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 441 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 442 Result.SSamp = true; 443 444 return Result; 445 } 446 if (TII.isMTBUF(Opc)) { 447 if (AMDGPU::getMTBUFHasVAddr(Opc)) 448 Result.VAddr 
= true; 449 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 450 Result.SRsrc = true; 451 if (AMDGPU::getMTBUFHasSoffset(Opc)) 452 Result.SOffset = true; 453 454 return Result; 455 } 456 457 switch (Opc) { 458 default: 459 return Result; 460 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 461 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 462 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 463 Result.SBase = true; 464 return Result; 465 case AMDGPU::DS_READ_B32: 466 case AMDGPU::DS_READ_B64: 467 case AMDGPU::DS_READ_B32_gfx9: 468 case AMDGPU::DS_READ_B64_gfx9: 469 case AMDGPU::DS_WRITE_B32: 470 case AMDGPU::DS_WRITE_B64: 471 case AMDGPU::DS_WRITE_B32_gfx9: 472 case AMDGPU::DS_WRITE_B64_gfx9: 473 Result.Addr = true; 474 return Result; 475 } 476 } 477 478 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 479 const SIInstrInfo &TII, 480 const GCNSubtarget &STM) { 481 I = MI; 482 unsigned Opc = MI->getOpcode(); 483 InstClass = getInstClass(Opc, TII); 484 485 if (InstClass == UNKNOWN) 486 return; 487 488 switch (InstClass) { 489 case DS_READ: 490 EltSize = 491 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 492 : 4; 493 break; 494 case DS_WRITE: 495 EltSize = 496 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 497 : 4; 498 break; 499 case S_BUFFER_LOAD_IMM: 500 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 501 break; 502 default: 503 EltSize = 4; 504 break; 505 } 506 507 if (InstClass == MIMG) { 508 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 509 // Offset is not considered for MIMG instructions. 510 Offset = 0; 511 } else { 512 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 513 Offset = I->getOperand(OffsetIdx).getImm(); 514 } 515 516 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 517 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 518 519 Width = getOpcodeWidth(*I, TII); 520 521 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 522 Offset &= 0xffff; 523 } else if (InstClass != MIMG) { 524 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 525 if (InstClass != S_BUFFER_LOAD_IMM) { 526 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 527 } 528 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 529 } 530 531 AddressRegs Regs = getRegs(Opc, TII); 532 533 NumAddresses = 0; 534 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 535 AddrIdx[NumAddresses++] = 536 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 537 if (Regs.Addr) 538 AddrIdx[NumAddresses++] = 539 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 540 if (Regs.SBase) 541 AddrIdx[NumAddresses++] = 542 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 543 if (Regs.SRsrc) 544 AddrIdx[NumAddresses++] = 545 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 546 if (Regs.SOffset) 547 AddrIdx[NumAddresses++] = 548 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 549 if (Regs.VAddr) 550 AddrIdx[NumAddresses++] = 551 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 552 if (Regs.SSamp) 553 AddrIdx[NumAddresses++] = 554 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 555 assert(NumAddresses <= MaxAddressRegs); 556 557 for (unsigned J = 0; J < NumAddresses; J++) 558 AddrReg[J] = &I->getOperand(AddrIdx[J]); 559 } 560 561 } // end anonymous namespace. 
562 563 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE, 564 "SI Load Store Optimizer", false, false) 565 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) 566 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer", 567 false, false) 568 569 char SILoadStoreOptimizer::ID = 0; 570 571 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID; 572 573 FunctionPass *llvm::createSILoadStoreOptimizerPass() { 574 return new SILoadStoreOptimizer(); 575 } 576 577 static void moveInstsAfter(MachineBasicBlock::iterator I, 578 ArrayRef<MachineInstr *> InstsToMove) { 579 MachineBasicBlock *MBB = I->getParent(); 580 ++I; 581 for (MachineInstr *MI : InstsToMove) { 582 MI->removeFromParent(); 583 MBB->insert(I, MI); 584 } 585 } 586 587 static void addDefsUsesToList(const MachineInstr &MI, 588 DenseSet<Register> &RegDefs, 589 DenseSet<Register> &PhysRegUses) { 590 for (const MachineOperand &Op : MI.operands()) { 591 if (Op.isReg()) { 592 if (Op.isDef()) 593 RegDefs.insert(Op.getReg()); 594 else if (Op.readsReg() && Op.getReg().isPhysical()) 595 PhysRegUses.insert(Op.getReg()); 596 } 597 } 598 } 599 600 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A, 601 MachineBasicBlock::iterator B, 602 AliasAnalysis *AA) { 603 // RAW or WAR - cannot reorder 604 // WAW - cannot reorder 605 // RAR - safe to reorder 606 return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); 607 } 608 609 // Add MI and its defs to the lists if MI reads one of the defs that are 610 // already in the list. Returns true in that case. 611 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs, 612 DenseSet<Register> &PhysRegUses, 613 SmallVectorImpl<MachineInstr *> &Insts) { 614 for (MachineOperand &Use : MI.operands()) { 615 // If one of the defs is read, then there is a use of Def between I and the 616 // instruction that I will potentially be merged with. We will need to move 617 // this instruction after the merged instructions. 618 // 619 // Similarly, if there is a def which is read by an instruction that is to 620 // be moved for merging, then we need to move the def-instruction as well. 621 // This can only happen for physical registers such as M0; virtual 622 // registers are in SSA form. 623 if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) || 624 (Use.isDef() && RegDefs.count(Use.getReg())) || 625 (Use.isDef() && Use.getReg().isPhysical() && 626 PhysRegUses.count(Use.getReg())))) { 627 Insts.push_back(&MI); 628 addDefsUsesToList(MI, RegDefs, PhysRegUses); 629 return true; 630 } 631 } 632 633 return false; 634 } 635 636 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp, 637 ArrayRef<MachineInstr *> InstsToMove, 638 AliasAnalysis *AA) { 639 assert(MemOp.mayLoadOrStore()); 640 641 for (MachineInstr *InstToMove : InstsToMove) { 642 if (!InstToMove->mayLoadOrStore()) 643 continue; 644 if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA)) 645 return false; 646 } 647 return true; 648 } 649 650 // This function assumes that \p A and \p B have are identical except for 651 // size and offset, and they referecne adjacent memory. 
652 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 653 const MachineMemOperand *A, 654 const MachineMemOperand *B) { 655 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 656 unsigned Size = A->getSize() + B->getSize(); 657 // This function adds the offset parameter to the existing offset for A, 658 // so we pass 0 here as the offset and then manually set it to the correct 659 // value after the call. 660 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 661 MMO->setOffset(MinOffset); 662 return MMO; 663 } 664 665 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 666 const SIInstrInfo &TII, 667 const CombineInfo &Paired) { 668 assert(CI.InstClass == MIMG); 669 670 // Ignore instructions with tfe/lwe set. 671 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 672 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 673 674 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 675 return false; 676 677 // Check other optional immediate operands for equality. 678 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, 679 AMDGPU::OpName::d16, AMDGPU::OpName::unorm, 680 AMDGPU::OpName::da, AMDGPU::OpName::r128, 681 AMDGPU::OpName::a16, AMDGPU::OpName::dlc}; 682 683 for (auto op : OperandsToMatch) { 684 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 685 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 686 return false; 687 if (Idx != -1 && 688 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 689 return false; 690 } 691 692 // Check DMask for overlaps. 693 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 694 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 695 696 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 697 if ((1u << AllowedBitsForMin) <= MinMask) 698 return false; 699 700 return true; 701 } 702 703 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 704 unsigned ComponentCount, 705 const GCNSubtarget &STI) { 706 if (ComponentCount > 4) 707 return 0; 708 709 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 710 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 711 if (!OldFormatInfo) 712 return 0; 713 714 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 715 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 716 ComponentCount, 717 OldFormatInfo->NumFormat, STI); 718 719 if (!NewFormatInfo) 720 return 0; 721 722 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 723 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 724 725 return NewFormatInfo->Format; 726 } 727 728 // Return the value in the inclusive range [Lo,Hi] that is aligned to the 729 // highest power of two. Note that the result is well defined for all inputs 730 // including corner cases like: 731 // - if Lo == Hi, return that value 732 // - if Lo == 0, return 0 (even though the "- 1" below underflows 733 // - if Lo > Hi, return 0 (as if the range wrapped around) 734 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) { 735 return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1); 736 } 737 738 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 739 const GCNSubtarget &STI, 740 CombineInfo &Paired, 741 bool Modify) { 742 assert(CI.InstClass != MIMG); 743 744 // XXX - Would the same offset be OK? Is there any reason this would happen or 745 // be useful? 
746 if (CI.Offset == Paired.Offset) 747 return false; 748 749 // This won't be valid if the offset isn't aligned. 750 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0)) 751 return false; 752 753 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) { 754 755 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 = 756 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI); 757 if (!Info0) 758 return false; 759 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 = 760 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI); 761 if (!Info1) 762 return false; 763 764 if (Info0->BitsPerComp != Info1->BitsPerComp || 765 Info0->NumFormat != Info1->NumFormat) 766 return false; 767 768 // TODO: Should be possible to support more formats, but if format loads 769 // are not dword-aligned, the merged load might not be valid. 770 if (Info0->BitsPerComp != 32) 771 return false; 772 773 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0) 774 return false; 775 } 776 777 uint32_t EltOffset0 = CI.Offset / CI.EltSize; 778 uint32_t EltOffset1 = Paired.Offset / CI.EltSize; 779 CI.UseST64 = false; 780 CI.BaseOff = 0; 781 782 // Handle all non-DS instructions. 783 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) { 784 return (EltOffset0 + CI.Width == EltOffset1 || 785 EltOffset1 + Paired.Width == EltOffset0) && 786 CI.GLC == Paired.GLC && CI.DLC == Paired.DLC && 787 (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC); 788 } 789 790 // If the offset in elements doesn't fit in 8-bits, we might be able to use 791 // the stride 64 versions. 792 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 && 793 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) { 794 if (Modify) { 795 CI.Offset = EltOffset0 / 64; 796 Paired.Offset = EltOffset1 / 64; 797 CI.UseST64 = true; 798 } 799 return true; 800 } 801 802 // Check if the new offsets fit in the reduced 8-bit range. 803 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) { 804 if (Modify) { 805 CI.Offset = EltOffset0; 806 Paired.Offset = EltOffset1; 807 } 808 return true; 809 } 810 811 // Try to shift base address to decrease offsets. 812 uint32_t Min = std::min(EltOffset0, EltOffset1); 813 uint32_t Max = std::max(EltOffset0, EltOffset1); 814 815 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64; 816 if (((Max - Min) & ~Mask) == 0) { 817 if (Modify) { 818 // From the range of values we could use for BaseOff, choose the one that 819 // is aligned to the highest power of two, to maximise the chance that 820 // the same offset can be reused for other load/store pairs. 821 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min); 822 // Copy the low bits of the offsets, so that when we adjust them by 823 // subtracting BaseOff they will be multiples of 64. 824 BaseOff |= Min & maskTrailingOnes<uint32_t>(6); 825 CI.BaseOff = BaseOff * CI.EltSize; 826 CI.Offset = (EltOffset0 - BaseOff) / 64; 827 Paired.Offset = (EltOffset1 - BaseOff) / 64; 828 CI.UseST64 = true; 829 } 830 return true; 831 } 832 833 if (isUInt<8>(Max - Min)) { 834 if (Modify) { 835 // From the range of values we could use for BaseOff, choose the one that 836 // is aligned to the highest power of two, to maximise the chance that 837 // the same offset can be reused for other load/store pairs. 
838 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min); 839 CI.BaseOff = BaseOff * CI.EltSize; 840 CI.Offset = EltOffset0 - BaseOff; 841 Paired.Offset = EltOffset1 - BaseOff; 842 } 843 return true; 844 } 845 846 return false; 847 } 848 849 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM, 850 const CombineInfo &CI, 851 const CombineInfo &Paired) { 852 const unsigned Width = (CI.Width + Paired.Width); 853 switch (CI.InstClass) { 854 default: 855 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3)); 856 case S_BUFFER_LOAD_IMM: 857 switch (Width) { 858 default: 859 return false; 860 case 2: 861 case 4: 862 return true; 863 } 864 } 865 } 866 867 /// This function assumes that CI comes before Paired in a basic block. 868 bool SILoadStoreOptimizer::checkAndPrepareMerge( 869 CombineInfo &CI, CombineInfo &Paired, 870 SmallVectorImpl<MachineInstr *> &InstsToMove) { 871 872 // Check both offsets (or masks for MIMG) can be combined and fit in the 873 // reduced range. 874 if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired)) 875 return false; 876 877 if (CI.InstClass != MIMG && 878 (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))) 879 return false; 880 881 const unsigned Opc = CI.I->getOpcode(); 882 const InstClassEnum InstClass = getInstClass(Opc, *TII); 883 884 if (InstClass == UNKNOWN) { 885 return false; 886 } 887 const unsigned InstSubclass = getInstSubclass(Opc, *TII); 888 889 // Do not merge VMEM buffer instructions with "swizzled" bit set. 890 int Swizzled = 891 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); 892 if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) 893 return false; 894 895 DenseSet<Register> RegDefsToMove; 896 DenseSet<Register> PhysRegUsesToMove; 897 addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove); 898 899 MachineBasicBlock::iterator E = std::next(Paired.I); 900 MachineBasicBlock::iterator MBBI = std::next(CI.I); 901 MachineBasicBlock::iterator MBBE = CI.I->getParent()->end(); 902 for (; MBBI != E; ++MBBI) { 903 904 if (MBBI == MBBE) { 905 // CombineInfo::Order is a hint on the instruction ordering within the 906 // basic block. This hint suggests that CI precedes Paired, which is 907 // true most of the time. However, moveInstsAfter() processing a 908 // previous list may have changed this order in a situation when it 909 // moves an instruction which exists in some other merge list. 910 // In this case it must be dependent. 911 return false; 912 } 913 914 if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) || 915 (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) { 916 // This is not a matching instruction, but we can keep looking as 917 // long as one of these conditions are met: 918 // 1. It is safe to move I down past MBBI. 919 // 2. It is safe to move MBBI down past the instruction that I will 920 // be merged into. 921 922 if (MBBI->hasUnmodeledSideEffects()) { 923 // We can't re-order this instruction with respect to other memory 924 // operations, so we fail both conditions mentioned above. 925 return false; 926 } 927 928 if (MBBI->mayLoadOrStore() && 929 (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 930 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) { 931 // We fail condition #1, but we may still be able to satisfy condition 932 // #2. Add this instruction to the move list and then we will check 933 // if condition #2 holds once we have selected the matching instruction. 
934 InstsToMove.push_back(&*MBBI); 935 addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove); 936 continue; 937 } 938 939 // When we match I with another DS instruction we will be moving I down 940 // to the location of the matched instruction any uses of I will need to 941 // be moved down as well. 942 addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 943 InstsToMove); 944 continue; 945 } 946 947 // Don't merge volatiles. 948 if (MBBI->hasOrderedMemoryRef()) 949 return false; 950 951 int Swizzled = 952 AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz); 953 if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm()) 954 return false; 955 956 // Handle a case like 957 // DS_WRITE_B32 addr, v, idx0 958 // w = DS_READ_B32 addr, idx0 959 // DS_WRITE_B32 addr, f(w), idx1 960 // where the DS_READ_B32 ends up in InstsToMove and therefore prevents 961 // merging of the two writes. 962 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 963 InstsToMove)) 964 continue; 965 966 if (&*MBBI == &*Paired.I) { 967 // We need to go through the list of instructions that we plan to 968 // move and make sure they are all safe to move down past the merged 969 // instruction. 970 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 971 972 // Call offsetsCanBeCombined with modify = true so that the offsets are 973 // correct for the new instruction. This should return true, because 974 // this function should only be called on CombineInfo objects that 975 // have already been confirmed to be mergeable. 976 if (CI.InstClass != MIMG) 977 offsetsCanBeCombined(CI, *STM, Paired, true); 978 return true; 979 } 980 return false; 981 } 982 983 // We've found a load/store that we couldn't merge for some reason. 984 // We could potentially keep looking, but we'd need to make sure that 985 // it was safe to move I and also all the instruction in InstsToMove 986 // down past this instruction. 987 // check if we can move I across MBBI and if we can move all I's users 988 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 989 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 990 break; 991 } 992 return false; 993 } 994 995 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 996 if (STM->ldsRequiresM0Init()) 997 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 998 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 999 } 1000 1001 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1002 if (STM->ldsRequiresM0Init()) 1003 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1004 1005 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1006 : AMDGPU::DS_READ2ST64_B64_gfx9; 1007 } 1008 1009 MachineBasicBlock::iterator 1010 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1011 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1012 MachineBasicBlock *MBB = CI.I->getParent(); 1013 1014 // Be careful, since the addresses could be subregisters themselves in weird 1015 // cases, like vectors of pointers. 1016 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1017 1018 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1019 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1020 1021 unsigned NewOffset0 = CI.Offset; 1022 unsigned NewOffset1 = Paired.Offset; 1023 unsigned Opc = 1024 CI.UseST64 ? 
read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1025 1026 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1027 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1028 1029 if (NewOffset0 > NewOffset1) { 1030 // Canonicalize the merged instruction so the smaller offset comes first. 1031 std::swap(NewOffset0, NewOffset1); 1032 std::swap(SubRegIdx0, SubRegIdx1); 1033 } 1034 1035 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1036 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1037 1038 const MCInstrDesc &Read2Desc = TII->get(Opc); 1039 1040 const TargetRegisterClass *SuperRC = 1041 (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; 1042 Register DestReg = MRI->createVirtualRegister(SuperRC); 1043 1044 DebugLoc DL = CI.I->getDebugLoc(); 1045 1046 Register BaseReg = AddrReg->getReg(); 1047 unsigned BaseSubReg = AddrReg->getSubReg(); 1048 unsigned BaseRegFlags = 0; 1049 if (CI.BaseOff) { 1050 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1051 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1052 .addImm(CI.BaseOff); 1053 1054 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1055 BaseRegFlags = RegState::Kill; 1056 1057 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1058 .addReg(ImmReg) 1059 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1060 .addImm(0); // clamp bit 1061 BaseSubReg = 0; 1062 } 1063 1064 MachineInstrBuilder Read2 = 1065 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1066 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1067 .addImm(NewOffset0) // offset0 1068 .addImm(NewOffset1) // offset1 1069 .addImm(0) // gds 1070 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1071 1072 (void)Read2; 1073 1074 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1075 1076 // Copy to the old destination registers. 1077 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1078 .add(*Dest0) // Copy to same destination including flags and sub reg. 1079 .addReg(DestReg, 0, SubRegIdx0); 1080 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1081 .add(*Dest1) 1082 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1083 1084 moveInstsAfter(Copy1, InstsToMove); 1085 1086 CI.I->eraseFromParent(); 1087 Paired.I->eraseFromParent(); 1088 1089 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1090 return Read2; 1091 } 1092 1093 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1094 if (STM->ldsRequiresM0Init()) 1095 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1096 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1097 : AMDGPU::DS_WRITE2_B64_gfx9; 1098 } 1099 1100 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1101 if (STM->ldsRequiresM0Init()) 1102 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1103 : AMDGPU::DS_WRITE2ST64_B64; 1104 1105 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1106 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1107 } 1108 1109 MachineBasicBlock::iterator 1110 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1111 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1112 MachineBasicBlock *MBB = CI.I->getParent(); 1113 1114 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1115 // sure we preserve the subregister index and any register flags set on them. 
1116 const MachineOperand *AddrReg = 1117 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1118 const MachineOperand *Data0 = 1119 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1120 const MachineOperand *Data1 = 1121 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1122 1123 unsigned NewOffset0 = CI.Offset; 1124 unsigned NewOffset1 = Paired.Offset; 1125 unsigned Opc = 1126 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1127 1128 if (NewOffset0 > NewOffset1) { 1129 // Canonicalize the merged instruction so the smaller offset comes first. 1130 std::swap(NewOffset0, NewOffset1); 1131 std::swap(Data0, Data1); 1132 } 1133 1134 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1135 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1136 1137 const MCInstrDesc &Write2Desc = TII->get(Opc); 1138 DebugLoc DL = CI.I->getDebugLoc(); 1139 1140 Register BaseReg = AddrReg->getReg(); 1141 unsigned BaseSubReg = AddrReg->getSubReg(); 1142 unsigned BaseRegFlags = 0; 1143 if (CI.BaseOff) { 1144 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1145 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1146 .addImm(CI.BaseOff); 1147 1148 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1149 BaseRegFlags = RegState::Kill; 1150 1151 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1152 .addReg(ImmReg) 1153 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1154 .addImm(0); // clamp bit 1155 BaseSubReg = 0; 1156 } 1157 1158 MachineInstrBuilder Write2 = 1159 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1160 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1161 .add(*Data0) // data0 1162 .add(*Data1) // data1 1163 .addImm(NewOffset0) // offset0 1164 .addImm(NewOffset1) // offset1 1165 .addImm(0) // gds 1166 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1167 1168 moveInstsAfter(Write2, InstsToMove); 1169 1170 CI.I->eraseFromParent(); 1171 Paired.I->eraseFromParent(); 1172 1173 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1174 return Write2; 1175 } 1176 1177 MachineBasicBlock::iterator 1178 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1179 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1180 MachineBasicBlock *MBB = CI.I->getParent(); 1181 DebugLoc DL = CI.I->getDebugLoc(); 1182 const unsigned Opcode = getNewOpcode(CI, Paired); 1183 1184 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1185 1186 Register DestReg = MRI->createVirtualRegister(SuperRC); 1187 unsigned MergedDMask = CI.DMask | Paired.DMask; 1188 unsigned DMaskIdx = 1189 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1190 1191 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1192 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1193 if (I == DMaskIdx) 1194 MIB.addImm(MergedDMask); 1195 else 1196 MIB.add((*CI.I).getOperand(I)); 1197 } 1198 1199 // It shouldn't be possible to get this far if the two instructions 1200 // don't have a single memoperand, because MachineInstr::mayAlias() 1201 // will return true if this is the case. 
1202 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1203 1204 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1205 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1206 1207 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1208 1209 unsigned SubRegIdx0, SubRegIdx1; 1210 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1211 1212 // Copy to the old destination registers. 1213 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1214 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1215 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1216 1217 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1218 .add(*Dest0) // Copy to same destination including flags and sub reg. 1219 .addReg(DestReg, 0, SubRegIdx0); 1220 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1221 .add(*Dest1) 1222 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1223 1224 moveInstsAfter(Copy1, InstsToMove); 1225 1226 CI.I->eraseFromParent(); 1227 Paired.I->eraseFromParent(); 1228 return New; 1229 } 1230 1231 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1232 CombineInfo &CI, CombineInfo &Paired, 1233 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1234 MachineBasicBlock *MBB = CI.I->getParent(); 1235 DebugLoc DL = CI.I->getDebugLoc(); 1236 const unsigned Opcode = getNewOpcode(CI, Paired); 1237 1238 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1239 1240 Register DestReg = MRI->createVirtualRegister(SuperRC); 1241 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1242 1243 // It shouldn't be possible to get this far if the two instructions 1244 // don't have a single memoperand, because MachineInstr::mayAlias() 1245 // will return true if this is the case. 1246 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1247 1248 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1249 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1250 1251 MachineInstr *New = 1252 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1253 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1254 .addImm(MergedOffset) // offset 1255 .addImm(CI.GLC) // glc 1256 .addImm(CI.DLC) // dlc 1257 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1258 1259 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1260 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1261 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1262 1263 // Copy to the old destination registers. 1264 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1265 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1266 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1267 1268 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1269 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1270 .addReg(DestReg, 0, SubRegIdx0); 1271 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1272 .add(*Dest1) 1273 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1274 1275 moveInstsAfter(Copy1, InstsToMove); 1276 1277 CI.I->eraseFromParent(); 1278 Paired.I->eraseFromParent(); 1279 return New; 1280 } 1281 1282 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1283 CombineInfo &CI, CombineInfo &Paired, 1284 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1285 MachineBasicBlock *MBB = CI.I->getParent(); 1286 DebugLoc DL = CI.I->getDebugLoc(); 1287 1288 const unsigned Opcode = getNewOpcode(CI, Paired); 1289 1290 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1291 1292 // Copy to the new source register. 1293 Register DestReg = MRI->createVirtualRegister(SuperRC); 1294 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1295 1296 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1297 1298 AddressRegs Regs = getRegs(Opcode, *TII); 1299 1300 if (Regs.VAddr) 1301 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1302 1303 // It shouldn't be possible to get this far if the two instructions 1304 // don't have a single memoperand, because MachineInstr::mayAlias() 1305 // will return true if this is the case. 1306 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1307 1308 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1309 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1310 1311 MachineInstr *New = 1312 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1313 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1314 .addImm(MergedOffset) // offset 1315 .addImm(CI.GLC) // glc 1316 .addImm(CI.SLC) // slc 1317 .addImm(0) // tfe 1318 .addImm(CI.DLC) // dlc 1319 .addImm(0) // swz 1320 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1321 1322 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1323 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1324 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1325 1326 // Copy to the old destination registers. 1327 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1328 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1329 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1330 1331 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1332 .add(*Dest0) // Copy to same destination including flags and sub reg. 1333 .addReg(DestReg, 0, SubRegIdx0); 1334 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1335 .add(*Dest1) 1336 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1337 1338 moveInstsAfter(Copy1, InstsToMove); 1339 1340 CI.I->eraseFromParent(); 1341 Paired.I->eraseFromParent(); 1342 return New; 1343 } 1344 1345 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1346 CombineInfo &CI, CombineInfo &Paired, 1347 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1348 MachineBasicBlock *MBB = CI.I->getParent(); 1349 DebugLoc DL = CI.I->getDebugLoc(); 1350 1351 const unsigned Opcode = getNewOpcode(CI, Paired); 1352 1353 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1354 1355 // Copy to the new source register. 
1356 Register DestReg = MRI->createVirtualRegister(SuperRC); 1357 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1358 1359 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1360 1361 AddressRegs Regs = getRegs(Opcode, *TII); 1362 1363 if (Regs.VAddr) 1364 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1365 1366 unsigned JoinedFormat = 1367 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1368 1369 // It shouldn't be possible to get this far if the two instructions 1370 // don't have a single memoperand, because MachineInstr::mayAlias() 1371 // will return true if this is the case. 1372 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1373 1374 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1375 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1376 1377 MachineInstr *New = 1378 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1379 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1380 .addImm(MergedOffset) // offset 1381 .addImm(JoinedFormat) // format 1382 .addImm(CI.GLC) // glc 1383 .addImm(CI.SLC) // slc 1384 .addImm(0) // tfe 1385 .addImm(CI.DLC) // dlc 1386 .addImm(0) // swz 1387 .addMemOperand( 1388 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1389 1390 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1391 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1392 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1393 1394 // Copy to the old destination registers. 1395 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1396 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1397 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1398 1399 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1400 .add(*Dest0) // Copy to same destination including flags and sub reg. 1401 .addReg(DestReg, 0, SubRegIdx0); 1402 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1403 .add(*Dest1) 1404 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1405 1406 moveInstsAfter(Copy1, InstsToMove); 1407 1408 CI.I->eraseFromParent(); 1409 Paired.I->eraseFromParent(); 1410 return New; 1411 } 1412 1413 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1414 CombineInfo &CI, CombineInfo &Paired, 1415 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1416 MachineBasicBlock *MBB = CI.I->getParent(); 1417 DebugLoc DL = CI.I->getDebugLoc(); 1418 1419 const unsigned Opcode = getNewOpcode(CI, Paired); 1420 1421 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1422 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1423 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1424 1425 // Copy to the new source register. 
1426 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1427 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1428 1429 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1430 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1431 1432 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1433 .add(*Src0) 1434 .addImm(SubRegIdx0) 1435 .add(*Src1) 1436 .addImm(SubRegIdx1); 1437 1438 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1439 .addReg(SrcReg, RegState::Kill); 1440 1441 AddressRegs Regs = getRegs(Opcode, *TII); 1442 1443 if (Regs.VAddr) 1444 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1445 1446 unsigned JoinedFormat = 1447 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1448 1449 // It shouldn't be possible to get this far if the two instructions 1450 // don't have a single memoperand, because MachineInstr::mayAlias() 1451 // will return true if this is the case. 1452 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1453 1454 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1455 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1456 1457 MachineInstr *New = 1458 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1459 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1460 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1461 .addImm(JoinedFormat) // format 1462 .addImm(CI.GLC) // glc 1463 .addImm(CI.SLC) // slc 1464 .addImm(0) // tfe 1465 .addImm(CI.DLC) // dlc 1466 .addImm(0) // swz 1467 .addMemOperand( 1468 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1469 1470 moveInstsAfter(MIB, InstsToMove); 1471 1472 CI.I->eraseFromParent(); 1473 Paired.I->eraseFromParent(); 1474 return New; 1475 } 1476 1477 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1478 const CombineInfo &Paired) { 1479 const unsigned Width = CI.Width + Paired.Width; 1480 1481 switch (CI.InstClass) { 1482 default: 1483 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1484 // FIXME: Handle d16 correctly 1485 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1486 Width); 1487 case TBUFFER_LOAD: 1488 case TBUFFER_STORE: 1489 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1490 Width); 1491 1492 case UNKNOWN: 1493 llvm_unreachable("Unknown instruction class"); 1494 case S_BUFFER_LOAD_IMM: 1495 switch (Width) { 1496 default: 1497 return 0; 1498 case 2: 1499 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1500 case 4: 1501 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1502 } 1503 case MIMG: 1504 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1505 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1506 } 1507 } 1508 1509 std::pair<unsigned, unsigned> 1510 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1511 1512 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1513 return std::make_pair(0, 0); 1514 1515 bool ReverseOrder; 1516 if (CI.InstClass == MIMG) { 1517 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1518 "No overlaps"); 1519 ReverseOrder = CI.DMask > Paired.DMask; 1520 } else 1521 ReverseOrder = CI.Offset > Paired.Offset; 1522 1523 static const unsigned Idxs[4][4] = { 1524 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1525 {AMDGPU::sub1, 
AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1526 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1527 {AMDGPU::sub3, 0, 0, 0}, 1528 }; 1529 unsigned Idx0; 1530 unsigned Idx1; 1531 1532 assert(CI.Width >= 1 && CI.Width <= 3); 1533 assert(Paired.Width >= 1 && Paired.Width <= 3); 1534 1535 if (ReverseOrder) { 1536 Idx1 = Idxs[0][Paired.Width - 1]; 1537 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1538 } else { 1539 Idx0 = Idxs[0][CI.Width - 1]; 1540 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1541 } 1542 1543 return std::make_pair(Idx0, Idx1); 1544 } 1545 1546 const TargetRegisterClass * 1547 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1548 const CombineInfo &Paired) { 1549 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1550 switch (CI.Width + Paired.Width) { 1551 default: 1552 return nullptr; 1553 case 2: 1554 return &AMDGPU::SReg_64_XEXECRegClass; 1555 case 4: 1556 return &AMDGPU::SGPR_128RegClass; 1557 case 8: 1558 return &AMDGPU::SGPR_256RegClass; 1559 case 16: 1560 return &AMDGPU::SGPR_512RegClass; 1561 } 1562 } else { 1563 switch (CI.Width + Paired.Width) { 1564 default: 1565 return nullptr; 1566 case 2: 1567 return &AMDGPU::VReg_64RegClass; 1568 case 3: 1569 return &AMDGPU::VReg_96RegClass; 1570 case 4: 1571 return &AMDGPU::VReg_128RegClass; 1572 } 1573 } 1574 } 1575 1576 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1577 CombineInfo &CI, CombineInfo &Paired, 1578 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1579 MachineBasicBlock *MBB = CI.I->getParent(); 1580 DebugLoc DL = CI.I->getDebugLoc(); 1581 1582 const unsigned Opcode = getNewOpcode(CI, Paired); 1583 1584 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1585 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1586 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1587 1588 // Copy to the new source register. 1589 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1590 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1591 1592 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1593 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1594 1595 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1596 .add(*Src0) 1597 .addImm(SubRegIdx0) 1598 .add(*Src1) 1599 .addImm(SubRegIdx1); 1600 1601 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1602 .addReg(SrcReg, RegState::Kill); 1603 1604 AddressRegs Regs = getRegs(Opcode, *TII); 1605 1606 if (Regs.VAddr) 1607 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1608 1609 1610 // It shouldn't be possible to get this far if the two instructions 1611 // don't have a single memoperand, because MachineInstr::mayAlias() 1612 // will return true if this is the case. 
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.GLC)                             // glc
          .addImm(CI.SLC)                             // slc
          .addImm(0)                                  // tfe
          .addImm(CI.DLC)                             // dlc
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
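// The new base is Base + Addr.Offset, expanded into a carry chain:
//   %lo, %carry = V_ADD_CO_U32_e64  base.lo, offset.lo
//   %hi         = V_ADDC_U32_e64    base.hi, offset.hi, %carry
//   %base       = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// Offset halves that are not inline constants are first materialized with
// S_MOV_B32 by createRegOrImm() above.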
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
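  // Only global loads and stores are handled for now: getGlobalSaddrOp()
  // returns a negative value for opcodes that have no global SADDR form,
  // which is what this check filters on.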
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (one that has
  // the same base registers) with the highest 13-bit distance from MI's
  // offset. E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

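  // Scan forward in the block for instructions with the same opcode and the
  // same base registers. A candidate becomes the anchor only if the distance
  // from MI's offset is legal for the global addressing mode (the file header
  // describes this as a 13-bit immediate offset); among legal candidates the
  // one with the largest absolute distance wins.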
  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

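// Walk [Begin, End) and bucket the instructions this pass knows how to merge
// into per-base-address lists. The walk stops early at the first instruction
// with an ordered memory reference; the returned iterator is where a later
// call should resume, and the returned flag records whether any constant
// offsets were promoted along the way.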
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this, so
    // break the search. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

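// Try to merge pairs within a single same-base-address list. The list is
// visited in offset order; after a successful merge the resulting wider access
// takes the earlier element's place so it can be considered again, and
// OptimizeListAgain is set when the merged width still leaves room to grow.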
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

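// Pass entry point. Each basic block is processed in sections separated by
// ordered memory references: mergeable instructions in a section are collected
// once, then optimizeBlock() is rerun on the collected lists until no further
// merges are found.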
LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); 2185 2186 bool Modified = false; 2187 2188 // Contains the list of instructions for which constant offsets are being 2189 // promoted to the IMM. This is tracked for an entire block at time. 2190 SmallPtrSet<MachineInstr *, 4> AnchorList; 2191 MemInfoMap Visited; 2192 2193 for (MachineBasicBlock &MBB : MF) { 2194 MachineBasicBlock::iterator SectionEnd; 2195 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; 2196 I = SectionEnd) { 2197 bool CollectModified; 2198 std::list<std::list<CombineInfo>> MergeableInsts; 2199 2200 // First pass: Collect list of all instructions we know how to merge in a 2201 // subset of the block. 2202 std::tie(SectionEnd, CollectModified) = 2203 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts); 2204 2205 Modified |= CollectModified; 2206 2207 do { 2208 OptimizeAgain = false; 2209 Modified |= optimizeBlock(MergeableInsts); 2210 } while (OptimizeAgain); 2211 } 2212 2213 Visited.clear(); 2214 AnchorList.clear(); 2215 } 2216 2217 return Modified; 2218 } 2219