1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass tries to fuse DS instructions with close by immediate offsets. 10 // This will fuse operations such as 11 // ds_read_b32 v0, v2 offset:16 12 // ds_read_b32 v1, v2 offset:32 13 // ==> 14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8 15 // 16 // The same is done for certain SMEM and VMEM opcodes, e.g.: 17 // s_buffer_load_dword s4, s[0:3], 4 18 // s_buffer_load_dword s5, s[0:3], 8 19 // ==> 20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4 21 // 22 // This pass also tries to promote constant offset to the immediate by 23 // adjusting the base. It tries to use a base from the nearby instructions that 24 // allows it to have a 13bit constant offset and then promotes the 13bit offset 25 // to the immediate. 26 // E.g. 27 // s_movk_i32 s0, 0x1800 28 // v_add_co_u32_e32 v0, vcc, s0, v2 29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc 30 // 31 // s_movk_i32 s0, 0x1000 32 // v_add_co_u32_e32 v5, vcc, s0, v2 33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 34 // global_load_dwordx2 v[5:6], v[5:6], off 35 // global_load_dwordx2 v[0:1], v[0:1], off 36 // => 37 // s_movk_i32 s0, 0x1000 38 // v_add_co_u32_e32 v5, vcc, s0, v2 39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 40 // global_load_dwordx2 v[5:6], v[5:6], off 41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048 42 // 43 // Future improvements: 44 // 45 // - This is currently missing stores of constants because loading 46 // the constant into the data register is placed between the stores, although 47 // this is arguably a scheduling problem. 48 // 49 // - Live interval recomputing seems inefficient. This currently only matches 50 // one pair, and recomputes live intervals and moves on to the next pair. It 51 // would be better to compute a list of all merges that need to occur. 52 // 53 // - With a list of instructions to process, we can also merge more. If a 54 // cluster of loads have offsets that are too large to fit in the 8-bit 55 // offsets, but are close enough to fit in the 8 bits, we can add to the base 56 // pointer and use the new reduced offsets. 57 // 58 //===----------------------------------------------------------------------===// 59 60 #include "AMDGPU.h" 61 #include "AMDGPUSubtarget.h" 62 #include "llvm/Analysis/AliasAnalysis.h" 63 #include "llvm/CodeGen/MachineFunctionPass.h" 64 #include "llvm/InitializePasses.h" 65 66 using namespace llvm; 67 68 #define DEBUG_TYPE "si-load-store-opt" 69 70 namespace { 71 enum InstClassEnum { 72 UNKNOWN, 73 DS_READ, 74 DS_WRITE, 75 S_BUFFER_LOAD_IMM, 76 BUFFER_LOAD, 77 BUFFER_STORE, 78 MIMG, 79 TBUFFER_LOAD, 80 TBUFFER_STORE, 81 }; 82 83 struct AddressRegs { 84 unsigned char NumVAddrs = 0; 85 bool SBase = false; 86 bool SRsrc = false; 87 bool SOffset = false; 88 bool VAddr = false; 89 bool Addr = false; 90 bool SSamp = false; 91 }; 92 93 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp. 
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    bool GLC;
    bool SLC;
    bool DLC;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
156 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 157 return false; 158 } 159 return true; 160 } 161 162 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 163 const GCNSubtarget &STM); 164 }; 165 166 struct BaseRegisters { 167 Register LoReg; 168 Register HiReg; 169 170 unsigned LoSubReg = 0; 171 unsigned HiSubReg = 0; 172 }; 173 174 struct MemAddress { 175 BaseRegisters Base; 176 int64_t Offset = 0; 177 }; 178 179 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 180 181 private: 182 const GCNSubtarget *STM = nullptr; 183 const SIInstrInfo *TII = nullptr; 184 const SIRegisterInfo *TRI = nullptr; 185 MachineRegisterInfo *MRI = nullptr; 186 AliasAnalysis *AA = nullptr; 187 bool OptimizeAgain; 188 189 static bool dmasksCanBeCombined(const CombineInfo &CI, 190 const SIInstrInfo &TII, 191 const CombineInfo &Paired); 192 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 193 CombineInfo &Paired, bool Modify = false); 194 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 195 const CombineInfo &Paired); 196 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 197 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 198 const CombineInfo &Paired); 199 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 200 const CombineInfo &Paired); 201 202 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 203 SmallVectorImpl<MachineInstr *> &InstsToMove); 204 205 unsigned read2Opcode(unsigned EltSize) const; 206 unsigned read2ST64Opcode(unsigned EltSize) const; 207 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 208 CombineInfo &Paired, 209 const SmallVectorImpl<MachineInstr *> &InstsToMove); 210 211 unsigned write2Opcode(unsigned EltSize) const; 212 unsigned write2ST64Opcode(unsigned EltSize) const; 213 MachineBasicBlock::iterator 214 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 215 const SmallVectorImpl<MachineInstr *> &InstsToMove); 216 MachineBasicBlock::iterator 217 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 218 const SmallVectorImpl<MachineInstr *> &InstsToMove); 219 MachineBasicBlock::iterator 220 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 221 const SmallVectorImpl<MachineInstr *> &InstsToMove); 222 MachineBasicBlock::iterator 223 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 224 const SmallVectorImpl<MachineInstr *> &InstsToMove); 225 MachineBasicBlock::iterator 226 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 227 const SmallVectorImpl<MachineInstr *> &InstsToMove); 228 MachineBasicBlock::iterator 229 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 230 const SmallVectorImpl<MachineInstr *> &InstsToMove); 231 MachineBasicBlock::iterator 232 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 233 const SmallVectorImpl<MachineInstr *> &InstsToMove); 234 235 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 236 int32_t NewOffset) const; 237 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 238 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 239 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 240 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 241 /// Promotes constant offset to the immediate by adjusting the base. 
It 242 /// tries to use a base from the nearby instructions that allows it to have 243 /// a 13bit constant offset which gets promoted to the immediate. 244 bool promoteConstantOffsetToImm(MachineInstr &CI, 245 MemInfoMap &Visited, 246 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 247 void addInstToMergeableList(const CombineInfo &CI, 248 std::list<std::list<CombineInfo> > &MergeableInsts) const; 249 250 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 251 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 252 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 253 std::list<std::list<CombineInfo>> &MergeableInsts) const; 254 255 public: 256 static char ID; 257 258 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 259 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 260 } 261 262 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 263 bool &OptimizeListAgain); 264 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 265 266 bool runOnMachineFunction(MachineFunction &MF) override; 267 268 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 269 270 void getAnalysisUsage(AnalysisUsage &AU) const override { 271 AU.setPreservesCFG(); 272 AU.addRequired<AAResultsWrapperPass>(); 273 274 MachineFunctionPass::getAnalysisUsage(AU); 275 } 276 277 MachineFunctionProperties getRequiredProperties() const override { 278 return MachineFunctionProperties() 279 .set(MachineFunctionProperties::Property::IsSSA); 280 } 281 }; 282 283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 284 const unsigned Opc = MI.getOpcode(); 285 286 if (TII.isMUBUF(Opc)) { 287 // FIXME: Handle d16 correctly 288 return AMDGPU::getMUBUFElements(Opc); 289 } 290 if (TII.isMIMG(MI)) { 291 uint64_t DMaskImm = 292 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 293 return countPopulation(DMaskImm); 294 } 295 if (TII.isMTBUF(Opc)) { 296 return AMDGPU::getMTBUFElements(Opc); 297 } 298 299 switch (Opc) { 300 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 301 return 1; 302 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 303 return 2; 304 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 305 return 4; 306 default: 307 return 0; 308 } 309 } 310 311 /// Maps instruction opcode to enum InstClassEnum. 312 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 313 switch (Opc) { 314 default: 315 if (TII.isMUBUF(Opc)) { 316 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 317 default: 318 return UNKNOWN; 319 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 320 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 321 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 322 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 323 return BUFFER_LOAD; 324 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 325 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 326 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 327 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 328 return BUFFER_STORE; 329 } 330 } 331 if (TII.isMIMG(Opc)) { 332 // Ignore instructions encoded without vaddr. 333 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 334 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 335 return UNKNOWN; 336 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
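      // Only plain MIMG loads are merged below; instructions that may store
      // (including atomics) and gather4, whose dmask has different semantics,
      // are rejected.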
337 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 338 TII.isGather4(Opc)) 339 return UNKNOWN; 340 return MIMG; 341 } 342 if (TII.isMTBUF(Opc)) { 343 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 344 default: 345 return UNKNOWN; 346 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 347 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 348 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 349 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 350 return TBUFFER_LOAD; 351 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 352 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 353 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 354 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 355 return TBUFFER_STORE; 356 } 357 } 358 return UNKNOWN; 359 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 360 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 361 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 362 return S_BUFFER_LOAD_IMM; 363 case AMDGPU::DS_READ_B32: 364 case AMDGPU::DS_READ_B32_gfx9: 365 case AMDGPU::DS_READ_B64: 366 case AMDGPU::DS_READ_B64_gfx9: 367 return DS_READ; 368 case AMDGPU::DS_WRITE_B32: 369 case AMDGPU::DS_WRITE_B32_gfx9: 370 case AMDGPU::DS_WRITE_B64: 371 case AMDGPU::DS_WRITE_B64_gfx9: 372 return DS_WRITE; 373 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa: 374 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa: 375 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa: 376 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa: 377 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa: 378 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa: 379 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa: 380 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa: 381 return UNKNOWN; 382 } 383 } 384 385 /// Determines instruction subclass from opcode. Only instructions 386 /// of the same subclass can be merged together. 387 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 388 switch (Opc) { 389 default: 390 if (TII.isMUBUF(Opc)) 391 return AMDGPU::getMUBUFBaseOpcode(Opc); 392 if (TII.isMIMG(Opc)) { 393 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 394 assert(Info); 395 return Info->BaseOpcode; 396 } 397 if (TII.isMTBUF(Opc)) 398 return AMDGPU::getMTBUFBaseOpcode(Opc); 399 return -1; 400 case AMDGPU::DS_READ_B32: 401 case AMDGPU::DS_READ_B32_gfx9: 402 case AMDGPU::DS_READ_B64: 403 case AMDGPU::DS_READ_B64_gfx9: 404 case AMDGPU::DS_WRITE_B32: 405 case AMDGPU::DS_WRITE_B32_gfx9: 406 case AMDGPU::DS_WRITE_B64: 407 case AMDGPU::DS_WRITE_B64_gfx9: 408 return Opc; 409 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 410 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 411 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 412 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 413 } 414 } 415 416 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 417 AddressRegs Result; 418 419 if (TII.isMUBUF(Opc)) { 420 if (AMDGPU::getMUBUFHasVAddr(Opc)) 421 Result.VAddr = true; 422 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 423 Result.SRsrc = true; 424 if (AMDGPU::getMUBUFHasSoffset(Opc)) 425 Result.SOffset = true; 426 427 return Result; 428 } 429 430 if (TII.isMIMG(Opc)) { 431 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 432 if (VAddr0Idx >= 0) { 433 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 434 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 435 } else { 436 Result.VAddr = true; 437 } 438 Result.SRsrc = true; 439 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 440 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 441 Result.SSamp = true; 442 443 return Result; 444 } 445 if (TII.isMTBUF(Opc)) { 446 if (AMDGPU::getMTBUFHasVAddr(Opc)) 447 Result.VAddr 
= true; 448 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 449 Result.SRsrc = true; 450 if (AMDGPU::getMTBUFHasSoffset(Opc)) 451 Result.SOffset = true; 452 453 return Result; 454 } 455 456 switch (Opc) { 457 default: 458 return Result; 459 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 460 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 461 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 462 Result.SBase = true; 463 return Result; 464 case AMDGPU::DS_READ_B32: 465 case AMDGPU::DS_READ_B64: 466 case AMDGPU::DS_READ_B32_gfx9: 467 case AMDGPU::DS_READ_B64_gfx9: 468 case AMDGPU::DS_WRITE_B32: 469 case AMDGPU::DS_WRITE_B64: 470 case AMDGPU::DS_WRITE_B32_gfx9: 471 case AMDGPU::DS_WRITE_B64_gfx9: 472 Result.Addr = true; 473 return Result; 474 } 475 } 476 477 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 478 const SIInstrInfo &TII, 479 const GCNSubtarget &STM) { 480 I = MI; 481 unsigned Opc = MI->getOpcode(); 482 InstClass = getInstClass(Opc, TII); 483 484 if (InstClass == UNKNOWN) 485 return; 486 487 switch (InstClass) { 488 case DS_READ: 489 EltSize = 490 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 491 : 4; 492 break; 493 case DS_WRITE: 494 EltSize = 495 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 496 : 4; 497 break; 498 case S_BUFFER_LOAD_IMM: 499 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 500 break; 501 default: 502 EltSize = 4; 503 break; 504 } 505 506 if (InstClass == MIMG) { 507 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 508 // Offset is not considered for MIMG instructions. 509 Offset = 0; 510 } else { 511 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 512 Offset = I->getOperand(OffsetIdx).getImm(); 513 } 514 515 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 516 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 517 518 Width = getOpcodeWidth(*I, TII); 519 520 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 521 Offset &= 0xffff; 522 } else if (InstClass != MIMG) { 523 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 524 if (InstClass != S_BUFFER_LOAD_IMM) { 525 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 526 } 527 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 528 } 529 530 AddressRegs Regs = getRegs(Opc, TII); 531 532 NumAddresses = 0; 533 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 534 AddrIdx[NumAddresses++] = 535 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 536 if (Regs.Addr) 537 AddrIdx[NumAddresses++] = 538 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 539 if (Regs.SBase) 540 AddrIdx[NumAddresses++] = 541 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 542 if (Regs.SRsrc) 543 AddrIdx[NumAddresses++] = 544 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 545 if (Regs.SOffset) 546 AddrIdx[NumAddresses++] = 547 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 548 if (Regs.VAddr) 549 AddrIdx[NumAddresses++] = 550 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 551 if (Regs.SSamp) 552 AddrIdx[NumAddresses++] = 553 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 554 assert(NumAddresses <= MaxAddressRegs); 555 556 for (unsigned J = 0; J < NumAddresses; J++) 557 AddrReg[J] = &I->getOperand(AddrIdx[J]); 558 } 559 560 } // end anonymous namespace. 
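
// Pass registration boilerplate: declares the AliasAnalysis dependency and
// exposes the pass so the AMDGPU target can construct it via
// createSILoadStoreOptimizerPass() below.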
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Op.getReg().isPhysical())
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && Use.getReg().isPhysical() &&
                         PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and that they reference adjacent memory.
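// For example, two 4-byte operands at offsets 16 and 20 are folded into a
// single 8-byte operand at offset 16.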
651 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 652 const MachineMemOperand *A, 653 const MachineMemOperand *B) { 654 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 655 unsigned Size = A->getSize() + B->getSize(); 656 // This function adds the offset parameter to the existing offset for A, 657 // so we pass 0 here as the offset and then manually set it to the correct 658 // value after the call. 659 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 660 MMO->setOffset(MinOffset); 661 return MMO; 662 } 663 664 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 665 const SIInstrInfo &TII, 666 const CombineInfo &Paired) { 667 assert(CI.InstClass == MIMG); 668 669 // Ignore instructions with tfe/lwe set. 670 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 671 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 672 673 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 674 return false; 675 676 // Check other optional immediate operands for equality. 677 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, 678 AMDGPU::OpName::d16, AMDGPU::OpName::unorm, 679 AMDGPU::OpName::da, AMDGPU::OpName::r128, 680 AMDGPU::OpName::a16, AMDGPU::OpName::dlc}; 681 682 for (auto op : OperandsToMatch) { 683 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 684 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 685 return false; 686 if (Idx != -1 && 687 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 688 return false; 689 } 690 691 // Check DMask for overlaps. 692 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 693 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 694 695 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 696 if ((1u << AllowedBitsForMin) <= MinMask) 697 return false; 698 699 return true; 700 } 701 702 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 703 unsigned ComponentCount, 704 const GCNSubtarget &STI) { 705 if (ComponentCount > 4) 706 return 0; 707 708 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 709 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 710 if (!OldFormatInfo) 711 return 0; 712 713 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 714 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 715 ComponentCount, 716 OldFormatInfo->NumFormat, STI); 717 718 if (!NewFormatInfo) 719 return 0; 720 721 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 722 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 723 724 return NewFormatInfo->Format; 725 } 726 727 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 728 const GCNSubtarget &STI, 729 CombineInfo &Paired, 730 bool Modify) { 731 assert(CI.InstClass != MIMG); 732 733 // XXX - Would the same offset be OK? Is there any reason this would happen or 734 // be useful? 735 if (CI.Offset == Paired.Offset) 736 return false; 737 738 // This won't be valid if the offset isn't aligned. 
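  // (For example, a DS_READ_B64 pair, EltSize 8, at byte offsets 8 and 20
  // would be rejected here because 20 is not a multiple of 8.)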
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  unsigned EltOffset0 = CI.Offset / CI.EltSize;
  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions (SMEM and VMEM): the two accesses must be
  // exactly adjacent and their cache policy bits must match.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
  }

  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset, Paired.Offset);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    if (Modify) {
      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    if (Modify) {
      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {

  // Check that both offsets (or masks for MIMG) can be combined and fit in
  // the reduced range.
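  // For example, two ds_read_b32 at byte offsets 16 and 32 (EltSize 4) become
  // element offsets 4 and 8, which fit the 8-bit offset0/offset1 fields of
  // ds_read2_b32.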
  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    return false;

  if (CI.InstClass != MIMG &&
      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
    return false;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  DenseSet<Register> RegDefsToMove;
  DenseSet<Register> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  MachineBasicBlock::iterator E = std::next(Paired.I);
  MachineBasicBlock::iterator MBBI = std::next(CI.I);
  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (; MBBI != E; ++MBBI) {

    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous merge list may have changed this order by moving an
      // instruction that also appears in some other merge list. In that
      // case the moved instruction must have been dependent, so give up on
      // this pair.
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another load/store to merge with, I will
      // effectively be moved down to the location of the matched instruction,
      // so any uses of I will need to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              InstsToMove))
      continue;

    if (&*MBBI == &*Paired.I) {
      // We need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {

        // Call offsetsCanBeCombined with modify = true so that the offsets are
        // correct for the new instruction. This should return true, because
        // this function should only be called on CombineInfo objects that
        // have already been confirmed to be mergeable.
        if (CI.InstClass != MIMG)
          offsetsCanBeCombined(CI, *STM, Paired, true);
        return true;
      }
      return false;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check whether we can move I across MBBI and whether we can move all
    // of I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ?
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; 1018 Register DestReg = MRI->createVirtualRegister(SuperRC); 1019 1020 DebugLoc DL = CI.I->getDebugLoc(); 1021 1022 Register BaseReg = AddrReg->getReg(); 1023 unsigned BaseSubReg = AddrReg->getSubReg(); 1024 unsigned BaseRegFlags = 0; 1025 if (CI.BaseOff) { 1026 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1027 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1028 .addImm(CI.BaseOff); 1029 1030 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1031 BaseRegFlags = RegState::Kill; 1032 1033 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1034 .addReg(ImmReg) 1035 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1036 .addImm(0); // clamp bit 1037 BaseSubReg = 0; 1038 } 1039 1040 MachineInstrBuilder Read2 = 1041 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1042 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1043 .addImm(NewOffset0) // offset0 1044 .addImm(NewOffset1) // offset1 1045 .addImm(0) // gds 1046 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1047 1048 (void)Read2; 1049 1050 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1051 1052 // Copy to the old destination registers. 1053 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1054 .add(*Dest0) // Copy to same destination including flags and sub reg. 1055 .addReg(DestReg, 0, SubRegIdx0); 1056 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1057 .add(*Dest1) 1058 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1059 1060 moveInstsAfter(Copy1, InstsToMove); 1061 1062 CI.I->eraseFromParent(); 1063 Paired.I->eraseFromParent(); 1064 1065 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1066 return Read2; 1067 } 1068 1069 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1070 if (STM->ldsRequiresM0Init()) 1071 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1072 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1073 : AMDGPU::DS_WRITE2_B64_gfx9; 1074 } 1075 1076 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1077 if (STM->ldsRequiresM0Init()) 1078 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1079 : AMDGPU::DS_WRITE2ST64_B64; 1080 1081 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1082 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1083 } 1084 1085 MachineBasicBlock::iterator 1086 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1087 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1088 MachineBasicBlock *MBB = CI.I->getParent(); 1089 1090 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1091 // sure we preserve the subregister index and any register flags set on them. 1092 const MachineOperand *AddrReg = 1093 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1094 const MachineOperand *Data0 = 1095 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1096 const MachineOperand *Data1 = 1097 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1098 1099 unsigned NewOffset0 = CI.Offset; 1100 unsigned NewOffset1 = Paired.Offset; 1101 unsigned Opc = 1102 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1103 1104 if (NewOffset0 > NewOffset1) { 1105 // Canonicalize the merged instruction so the smaller offset comes first. 
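    // For example, element offsets {6, 2} are emitted as offset0:2 offset1:6,
    // and data0/data1 are swapped so each value stays paired with its offset.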
1106 std::swap(NewOffset0, NewOffset1); 1107 std::swap(Data0, Data1); 1108 } 1109 1110 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1111 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1112 1113 const MCInstrDesc &Write2Desc = TII->get(Opc); 1114 DebugLoc DL = CI.I->getDebugLoc(); 1115 1116 Register BaseReg = AddrReg->getReg(); 1117 unsigned BaseSubReg = AddrReg->getSubReg(); 1118 unsigned BaseRegFlags = 0; 1119 if (CI.BaseOff) { 1120 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1121 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1122 .addImm(CI.BaseOff); 1123 1124 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1125 BaseRegFlags = RegState::Kill; 1126 1127 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1128 .addReg(ImmReg) 1129 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1130 .addImm(0); // clamp bit 1131 BaseSubReg = 0; 1132 } 1133 1134 MachineInstrBuilder Write2 = 1135 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1136 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1137 .add(*Data0) // data0 1138 .add(*Data1) // data1 1139 .addImm(NewOffset0) // offset0 1140 .addImm(NewOffset1) // offset1 1141 .addImm(0) // gds 1142 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1143 1144 moveInstsAfter(Write2, InstsToMove); 1145 1146 CI.I->eraseFromParent(); 1147 Paired.I->eraseFromParent(); 1148 1149 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1150 return Write2; 1151 } 1152 1153 MachineBasicBlock::iterator 1154 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1155 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1156 MachineBasicBlock *MBB = CI.I->getParent(); 1157 DebugLoc DL = CI.I->getDebugLoc(); 1158 const unsigned Opcode = getNewOpcode(CI, Paired); 1159 1160 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1161 1162 Register DestReg = MRI->createVirtualRegister(SuperRC); 1163 unsigned MergedDMask = CI.DMask | Paired.DMask; 1164 unsigned DMaskIdx = 1165 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1166 1167 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1168 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1169 if (I == DMaskIdx) 1170 MIB.addImm(MergedDMask); 1171 else 1172 MIB.add((*CI.I).getOperand(I)); 1173 } 1174 1175 // It shouldn't be possible to get this far if the two instructions 1176 // don't have a single memoperand, because MachineInstr::mayAlias() 1177 // will return true if this is the case. 1178 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1179 1180 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1181 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1182 1183 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1184 1185 unsigned SubRegIdx0, SubRegIdx1; 1186 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1187 1188 // Copy to the old destination registers. 1189 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1190 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1191 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1192 1193 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1194 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1195 .addReg(DestReg, 0, SubRegIdx0); 1196 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1197 .add(*Dest1) 1198 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1199 1200 moveInstsAfter(Copy1, InstsToMove); 1201 1202 CI.I->eraseFromParent(); 1203 Paired.I->eraseFromParent(); 1204 return New; 1205 } 1206 1207 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1208 CombineInfo &CI, CombineInfo &Paired, 1209 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1210 MachineBasicBlock *MBB = CI.I->getParent(); 1211 DebugLoc DL = CI.I->getDebugLoc(); 1212 const unsigned Opcode = getNewOpcode(CI, Paired); 1213 1214 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1215 1216 Register DestReg = MRI->createVirtualRegister(SuperRC); 1217 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1218 1219 // It shouldn't be possible to get this far if the two instructions 1220 // don't have a single memoperand, because MachineInstr::mayAlias() 1221 // will return true if this is the case. 1222 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1223 1224 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1225 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1226 1227 MachineInstr *New = 1228 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1229 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1230 .addImm(MergedOffset) // offset 1231 .addImm(CI.GLC) // glc 1232 .addImm(CI.DLC) // dlc 1233 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1234 1235 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1236 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1237 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1238 1239 // Copy to the old destination registers. 1240 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1241 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1242 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1243 1244 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1245 .add(*Dest0) // Copy to same destination including flags and sub reg. 1246 .addReg(DestReg, 0, SubRegIdx0); 1247 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1248 .add(*Dest1) 1249 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1250 1251 moveInstsAfter(Copy1, InstsToMove); 1252 1253 CI.I->eraseFromParent(); 1254 Paired.I->eraseFromParent(); 1255 return New; 1256 } 1257 1258 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1259 CombineInfo &CI, CombineInfo &Paired, 1260 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1261 MachineBasicBlock *MBB = CI.I->getParent(); 1262 DebugLoc DL = CI.I->getDebugLoc(); 1263 1264 const unsigned Opcode = getNewOpcode(CI, Paired); 1265 1266 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1267 1268 // Copy to the new source register. 1269 Register DestReg = MRI->createVirtualRegister(SuperRC); 1270 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1271 1272 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1273 1274 AddressRegs Regs = getRegs(Opcode, *TII); 1275 1276 if (Regs.VAddr) 1277 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1278 1279 // It shouldn't be possible to get this far if the two instructions 1280 // don't have a single memoperand, because MachineInstr::mayAlias() 1281 // will return true if this is the case. 
1282 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1283 1284 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1285 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1286 1287 MachineInstr *New = 1288 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1289 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1290 .addImm(MergedOffset) // offset 1291 .addImm(CI.GLC) // glc 1292 .addImm(CI.SLC) // slc 1293 .addImm(0) // tfe 1294 .addImm(CI.DLC) // dlc 1295 .addImm(0) // swz 1296 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1297 1298 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1299 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1300 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1301 1302 // Copy to the old destination registers. 1303 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1304 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1305 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1306 1307 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1308 .add(*Dest0) // Copy to same destination including flags and sub reg. 1309 .addReg(DestReg, 0, SubRegIdx0); 1310 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1311 .add(*Dest1) 1312 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1313 1314 moveInstsAfter(Copy1, InstsToMove); 1315 1316 CI.I->eraseFromParent(); 1317 Paired.I->eraseFromParent(); 1318 return New; 1319 } 1320 1321 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1322 CombineInfo &CI, CombineInfo &Paired, 1323 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1324 MachineBasicBlock *MBB = CI.I->getParent(); 1325 DebugLoc DL = CI.I->getDebugLoc(); 1326 1327 const unsigned Opcode = getNewOpcode(CI, Paired); 1328 1329 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1330 1331 // Copy to the new source register. 1332 Register DestReg = MRI->createVirtualRegister(SuperRC); 1333 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1334 1335 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1336 1337 AddressRegs Regs = getRegs(Opcode, *TII); 1338 1339 if (Regs.VAddr) 1340 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1341 1342 unsigned JoinedFormat = 1343 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1344 1345 // It shouldn't be possible to get this far if the two instructions 1346 // don't have a single memoperand, because MachineInstr::mayAlias() 1347 // will return true if this is the case. 1348 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1349 1350 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1351 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1352 1353 MachineInstr *New = 1354 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1355 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1356 .addImm(MergedOffset) // offset 1357 .addImm(JoinedFormat) // format 1358 .addImm(CI.GLC) // glc 1359 .addImm(CI.SLC) // slc 1360 .addImm(0) // tfe 1361 .addImm(CI.DLC) // dlc 1362 .addImm(0) // swz 1363 .addMemOperand( 1364 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1365 1366 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1367 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1368 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1369 1370 // Copy to the old destination registers. 
1371 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1372 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1373 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1374 1375 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1376 .add(*Dest0) // Copy to same destination including flags and sub reg. 1377 .addReg(DestReg, 0, SubRegIdx0); 1378 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1379 .add(*Dest1) 1380 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1381 1382 moveInstsAfter(Copy1, InstsToMove); 1383 1384 CI.I->eraseFromParent(); 1385 Paired.I->eraseFromParent(); 1386 return New; 1387 } 1388 1389 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1390 CombineInfo &CI, CombineInfo &Paired, 1391 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1392 MachineBasicBlock *MBB = CI.I->getParent(); 1393 DebugLoc DL = CI.I->getDebugLoc(); 1394 1395 const unsigned Opcode = getNewOpcode(CI, Paired); 1396 1397 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1398 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1399 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1400 1401 // Copy to the new source register. 1402 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1403 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1404 1405 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1406 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1407 1408 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1409 .add(*Src0) 1410 .addImm(SubRegIdx0) 1411 .add(*Src1) 1412 .addImm(SubRegIdx1); 1413 1414 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1415 .addReg(SrcReg, RegState::Kill); 1416 1417 AddressRegs Regs = getRegs(Opcode, *TII); 1418 1419 if (Regs.VAddr) 1420 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1421 1422 unsigned JoinedFormat = 1423 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1424 1425 // It shouldn't be possible to get this far if the two instructions 1426 // don't have a single memoperand, because MachineInstr::mayAlias() 1427 // will return true if this is the case. 
1428 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1429 1430 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1431 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1432 1433 MachineInstr *New = 1434 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1435 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1436 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1437 .addImm(JoinedFormat) // format 1438 .addImm(CI.GLC) // glc 1439 .addImm(CI.SLC) // slc 1440 .addImm(0) // tfe 1441 .addImm(CI.DLC) // dlc 1442 .addImm(0) // swz 1443 .addMemOperand( 1444 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1445 1446 moveInstsAfter(MIB, InstsToMove); 1447 1448 CI.I->eraseFromParent(); 1449 Paired.I->eraseFromParent(); 1450 return New; 1451 } 1452 1453 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1454 const CombineInfo &Paired) { 1455 const unsigned Width = CI.Width + Paired.Width; 1456 1457 switch (CI.InstClass) { 1458 default: 1459 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1460 // FIXME: Handle d16 correctly 1461 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1462 Width); 1463 case TBUFFER_LOAD: 1464 case TBUFFER_STORE: 1465 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1466 Width); 1467 1468 case UNKNOWN: 1469 llvm_unreachable("Unknown instruction class"); 1470 case S_BUFFER_LOAD_IMM: 1471 switch (Width) { 1472 default: 1473 return 0; 1474 case 2: 1475 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1476 case 4: 1477 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1478 } 1479 case MIMG: 1480 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1481 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1482 } 1483 } 1484 1485 std::pair<unsigned, unsigned> 1486 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1487 1488 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1489 return std::make_pair(0, 0); 1490 1491 bool ReverseOrder; 1492 if (CI.InstClass == MIMG) { 1493 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1494 "No overlaps"); 1495 ReverseOrder = CI.DMask > Paired.DMask; 1496 } else 1497 ReverseOrder = CI.Offset > Paired.Offset; 1498 1499 static const unsigned Idxs[4][4] = { 1500 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1501 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1502 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1503 {AMDGPU::sub3, 0, 0, 0}, 1504 }; 1505 unsigned Idx0; 1506 unsigned Idx1; 1507 1508 assert(CI.Width >= 1 && CI.Width <= 3); 1509 assert(Paired.Width >= 1 && Paired.Width <= 3); 1510 1511 if (ReverseOrder) { 1512 Idx1 = Idxs[0][Paired.Width - 1]; 1513 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1514 } else { 1515 Idx0 = Idxs[0][CI.Width - 1]; 1516 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1517 } 1518 1519 return std::make_pair(Idx0, Idx1); 1520 } 1521 1522 const TargetRegisterClass * 1523 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1524 const CombineInfo &Paired) { 1525 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1526 switch (CI.Width + Paired.Width) { 1527 default: 1528 return nullptr; 1529 case 2: 1530 return &AMDGPU::SReg_64_XEXECRegClass; 1531 case 4: 1532 return &AMDGPU::SGPR_128RegClass; 1533 case 8: 1534 return &AMDGPU::SGPR_256RegClass; 1535 case 16: 1536 return &AMDGPU::SGPR_512RegClass; 1537 } 1538 } 
else { 1539 switch (CI.Width + Paired.Width) { 1540 default: 1541 return nullptr; 1542 case 2: 1543 return &AMDGPU::VReg_64RegClass; 1544 case 3: 1545 return &AMDGPU::VReg_96RegClass; 1546 case 4: 1547 return &AMDGPU::VReg_128RegClass; 1548 } 1549 } 1550 } 1551 1552 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1553 CombineInfo &CI, CombineInfo &Paired, 1554 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1555 MachineBasicBlock *MBB = CI.I->getParent(); 1556 DebugLoc DL = CI.I->getDebugLoc(); 1557 1558 const unsigned Opcode = getNewOpcode(CI, Paired); 1559 1560 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1561 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1562 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1563 1564 // Copy to the new source register. 1565 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1566 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1567 1568 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1569 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1570 1571 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1572 .add(*Src0) 1573 .addImm(SubRegIdx0) 1574 .add(*Src1) 1575 .addImm(SubRegIdx1); 1576 1577 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1578 .addReg(SrcReg, RegState::Kill); 1579 1580 AddressRegs Regs = getRegs(Opcode, *TII); 1581 1582 if (Regs.VAddr) 1583 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1584 1585 1586 // It shouldn't be possible to get this far if the two instructions 1587 // don't have a single memoperand, because MachineInstr::mayAlias() 1588 // will return true if this is the case. 1589 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1590 1591 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1592 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1593 1594 MachineInstr *New = 1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1596 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1597 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1598 .addImm(CI.GLC) // glc 1599 .addImm(CI.SLC) // slc 1600 .addImm(0) // tfe 1601 .addImm(CI.DLC) // dlc 1602 .addImm(0) // swz 1603 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1604 1605 moveInstsAfter(MIB, InstsToMove); 1606 1607 CI.I->eraseFromParent(); 1608 Paired.I->eraseFromParent(); 1609 return New; 1610 } 1611 1612 MachineOperand 1613 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1614 APInt V(32, Val, true); 1615 if (TII->isInlineConstant(V)) 1616 return MachineOperand::CreateImm(Val); 1617 1618 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1619 MachineInstr *Mov = 1620 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1621 TII->get(AMDGPU::S_MOV_B32), Reg) 1622 .addImm(Val); 1623 (void)Mov; 1624 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1625 return MachineOperand::CreateReg(Reg, false); 1626 } 1627 1628 // Compute base address using Addr and return the final register. 
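// The 64-bit add is expanded as (sketch): V_ADD_CO_U32_e64 for the low half,
// V_ADDC_U32_e64 consuming the carry for the high half, and a REG_SEQUENCE
// that recombines the two halves into a 64-bit base register.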
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
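// The vaddr operand is redirected to NewBase (with its kill flag cleared,
// since the new base may have other users) and the immediate offset operand
// is overwritten with NewOffset.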
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - a 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
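  // The SADDR-form lookup below also serves as a filter: only global memory
  // instructions that have a saddr variant are considered here.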
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic chooses &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
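    // Only instructions with the same opcode and a zero immediate offset are
    // considered as anchor candidates.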
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
    std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    // We also won't be able to merge across this, so break the search; we can
    // look for separate merges after this barrier.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Prune any list that contains only a single instruction (there is
  // nothing to merge it with) and sort the remaining lists by offset so that
  // candidates for merging end up adjacent to each other.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
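      // Erasing the list also keeps the caller's OptimizeAgain loop from
      // revisiting a list that can no longer change.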
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

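  // Alias analysis is used when checking whether the instructions between a
  // candidate pair can safely be reordered around the loads and stores being
  // merged.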
LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); 2161 2162 bool Modified = false; 2163 2164 // Contains the list of instructions for which constant offsets are being 2165 // promoted to the IMM. This is tracked for an entire block at time. 2166 SmallPtrSet<MachineInstr *, 4> AnchorList; 2167 MemInfoMap Visited; 2168 2169 for (MachineBasicBlock &MBB : MF) { 2170 MachineBasicBlock::iterator SectionEnd; 2171 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; 2172 I = SectionEnd) { 2173 bool CollectModified; 2174 std::list<std::list<CombineInfo>> MergeableInsts; 2175 2176 // First pass: Collect list of all instructions we know how to merge in a 2177 // subset of the block. 2178 std::tie(SectionEnd, CollectModified) = 2179 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts); 2180 2181 Modified |= CollectModified; 2182 2183 do { 2184 OptimizeAgain = false; 2185 Modified |= optimizeBlock(MergeableInsts); 2186 } while (OptimizeAgain); 2187 } 2188 2189 Visited.clear(); 2190 AnchorList.clear(); 2191 } 2192 2193 return Modified; 2194 } 2195