//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offsets to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together to fit once the base pointer is
//   adjusted, we can add to the base pointer and use the new, reduced offsets.
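//   For example (hypothetical byte offsets): ds_read_b32 at offsets 1024 and
//   1028 have element offsets 256 and 257, which do not fit in the 8-bit
//   fields; adding 1024 to the base pointer leaves offset0:0 offset1:1,
//   which does.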
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    bool GLC;
    bool SLC;
    bool DLC;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
179 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 180 return false; 181 } 182 return true; 183 } 184 185 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 186 const GCNSubtarget &STM); 187 }; 188 189 struct BaseRegisters { 190 Register LoReg; 191 Register HiReg; 192 193 unsigned LoSubReg = 0; 194 unsigned HiSubReg = 0; 195 }; 196 197 struct MemAddress { 198 BaseRegisters Base; 199 int64_t Offset = 0; 200 }; 201 202 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 203 204 private: 205 const GCNSubtarget *STM = nullptr; 206 const SIInstrInfo *TII = nullptr; 207 const SIRegisterInfo *TRI = nullptr; 208 MachineRegisterInfo *MRI = nullptr; 209 AliasAnalysis *AA = nullptr; 210 bool OptimizeAgain; 211 212 static bool dmasksCanBeCombined(const CombineInfo &CI, 213 const SIInstrInfo &TII, 214 const CombineInfo &Paired); 215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 216 CombineInfo &Paired, bool Modify = false); 217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 218 const CombineInfo &Paired); 219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 221 const CombineInfo &Paired); 222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 223 const CombineInfo &Paired); 224 225 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 226 SmallVectorImpl<MachineInstr *> &InstsToMove); 227 228 unsigned read2Opcode(unsigned EltSize) const; 229 unsigned read2ST64Opcode(unsigned EltSize) const; 230 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 231 CombineInfo &Paired, 232 const SmallVectorImpl<MachineInstr *> &InstsToMove); 233 234 unsigned write2Opcode(unsigned EltSize) const; 235 unsigned write2ST64Opcode(unsigned EltSize) const; 236 MachineBasicBlock::iterator 237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 238 const SmallVectorImpl<MachineInstr *> &InstsToMove); 239 MachineBasicBlock::iterator 240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 241 const SmallVectorImpl<MachineInstr *> &InstsToMove); 242 MachineBasicBlock::iterator 243 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 244 const SmallVectorImpl<MachineInstr *> &InstsToMove); 245 MachineBasicBlock::iterator 246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 247 const SmallVectorImpl<MachineInstr *> &InstsToMove); 248 MachineBasicBlock::iterator 249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 250 const SmallVectorImpl<MachineInstr *> &InstsToMove); 251 MachineBasicBlock::iterator 252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 253 const SmallVectorImpl<MachineInstr *> &InstsToMove); 254 MachineBasicBlock::iterator 255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 256 const SmallVectorImpl<MachineInstr *> &InstsToMove); 257 258 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 259 int32_t NewOffset) const; 260 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 261 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 262 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 263 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 264 /// Promotes constant offset to the immediate by adjusting the base. 
It 265 /// tries to use a base from the nearby instructions that allows it to have 266 /// a 13bit constant offset which gets promoted to the immediate. 267 bool promoteConstantOffsetToImm(MachineInstr &CI, 268 MemInfoMap &Visited, 269 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 270 void addInstToMergeableList(const CombineInfo &CI, 271 std::list<std::list<CombineInfo> > &MergeableInsts) const; 272 273 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 274 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 275 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 276 std::list<std::list<CombineInfo>> &MergeableInsts) const; 277 278 public: 279 static char ID; 280 281 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 282 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 283 } 284 285 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 286 bool &OptimizeListAgain); 287 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 288 289 bool runOnMachineFunction(MachineFunction &MF) override; 290 291 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 292 293 void getAnalysisUsage(AnalysisUsage &AU) const override { 294 AU.setPreservesCFG(); 295 AU.addRequired<AAResultsWrapperPass>(); 296 297 MachineFunctionPass::getAnalysisUsage(AU); 298 } 299 300 MachineFunctionProperties getRequiredProperties() const override { 301 return MachineFunctionProperties() 302 .set(MachineFunctionProperties::Property::IsSSA); 303 } 304 }; 305 306 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 307 const unsigned Opc = MI.getOpcode(); 308 309 if (TII.isMUBUF(Opc)) { 310 // FIXME: Handle d16 correctly 311 return AMDGPU::getMUBUFElements(Opc); 312 } 313 if (TII.isMIMG(MI)) { 314 uint64_t DMaskImm = 315 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 316 return countPopulation(DMaskImm); 317 } 318 if (TII.isMTBUF(Opc)) { 319 return AMDGPU::getMTBUFElements(Opc); 320 } 321 322 switch (Opc) { 323 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 324 return 1; 325 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 326 return 2; 327 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 328 return 4; 329 default: 330 return 0; 331 } 332 } 333 334 /// Maps instruction opcode to enum InstClassEnum. 335 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 336 switch (Opc) { 337 default: 338 if (TII.isMUBUF(Opc)) { 339 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 340 default: 341 return UNKNOWN; 342 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 343 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 344 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 345 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 346 return BUFFER_LOAD; 347 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 348 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 349 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 350 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 351 return BUFFER_STORE; 352 } 353 } 354 if (TII.isMIMG(Opc)) { 355 // Ignore instructions encoded without vaddr. 356 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 && 357 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1) 358 return UNKNOWN; 359 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
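    // Only plain MIMG loads are merged here; stores, atomics (which both load
    // and store), and gather4 instructions are rejected by the check below.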
360 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 361 TII.isGather4(Opc)) 362 return UNKNOWN; 363 return MIMG; 364 } 365 if (TII.isMTBUF(Opc)) { 366 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 367 default: 368 return UNKNOWN; 369 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 370 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 371 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 372 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 373 return TBUFFER_LOAD; 374 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 375 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 376 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 377 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 378 return TBUFFER_STORE; 379 } 380 } 381 return UNKNOWN; 382 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 383 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 384 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 385 return S_BUFFER_LOAD_IMM; 386 case AMDGPU::DS_READ_B32: 387 case AMDGPU::DS_READ_B32_gfx9: 388 case AMDGPU::DS_READ_B64: 389 case AMDGPU::DS_READ_B64_gfx9: 390 return DS_READ; 391 case AMDGPU::DS_WRITE_B32: 392 case AMDGPU::DS_WRITE_B32_gfx9: 393 case AMDGPU::DS_WRITE_B64: 394 case AMDGPU::DS_WRITE_B64_gfx9: 395 return DS_WRITE; 396 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa: 397 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa: 398 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa: 399 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa: 400 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa: 401 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa: 402 case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa: 403 case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa: 404 return UNKNOWN; 405 } 406 } 407 408 /// Determines instruction subclass from opcode. Only instructions 409 /// of the same subclass can be merged together. 410 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 411 switch (Opc) { 412 default: 413 if (TII.isMUBUF(Opc)) 414 return AMDGPU::getMUBUFBaseOpcode(Opc); 415 if (TII.isMIMG(Opc)) { 416 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 417 assert(Info); 418 return Info->BaseOpcode; 419 } 420 if (TII.isMTBUF(Opc)) 421 return AMDGPU::getMTBUFBaseOpcode(Opc); 422 return -1; 423 case AMDGPU::DS_READ_B32: 424 case AMDGPU::DS_READ_B32_gfx9: 425 case AMDGPU::DS_READ_B64: 426 case AMDGPU::DS_READ_B64_gfx9: 427 case AMDGPU::DS_WRITE_B32: 428 case AMDGPU::DS_WRITE_B32_gfx9: 429 case AMDGPU::DS_WRITE_B64: 430 case AMDGPU::DS_WRITE_B64_gfx9: 431 return Opc; 432 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 433 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 434 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 435 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 436 } 437 } 438 439 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { 440 AddressRegs Result; 441 442 if (TII.isMUBUF(Opc)) { 443 if (AMDGPU::getMUBUFHasVAddr(Opc)) 444 Result.VAddr = true; 445 if (AMDGPU::getMUBUFHasSrsrc(Opc)) 446 Result.SRsrc = true; 447 if (AMDGPU::getMUBUFHasSoffset(Opc)) 448 Result.SOffset = true; 449 450 return Result; 451 } 452 453 if (TII.isMIMG(Opc)) { 454 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); 455 if (VAddr0Idx >= 0) { 456 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 457 Result.NumVAddrs = SRsrcIdx - VAddr0Idx; 458 } else { 459 Result.VAddr = true; 460 } 461 Result.SRsrc = true; 462 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 463 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 464 Result.SSamp = true; 465 466 return Result; 467 } 468 if (TII.isMTBUF(Opc)) { 469 if (AMDGPU::getMTBUFHasVAddr(Opc)) 470 Result.VAddr 
= true; 471 if (AMDGPU::getMTBUFHasSrsrc(Opc)) 472 Result.SRsrc = true; 473 if (AMDGPU::getMTBUFHasSoffset(Opc)) 474 Result.SOffset = true; 475 476 return Result; 477 } 478 479 switch (Opc) { 480 default: 481 return Result; 482 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 483 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 484 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 485 Result.SBase = true; 486 return Result; 487 case AMDGPU::DS_READ_B32: 488 case AMDGPU::DS_READ_B64: 489 case AMDGPU::DS_READ_B32_gfx9: 490 case AMDGPU::DS_READ_B64_gfx9: 491 case AMDGPU::DS_WRITE_B32: 492 case AMDGPU::DS_WRITE_B64: 493 case AMDGPU::DS_WRITE_B32_gfx9: 494 case AMDGPU::DS_WRITE_B64_gfx9: 495 Result.Addr = true; 496 return Result; 497 } 498 } 499 500 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 501 const SIInstrInfo &TII, 502 const GCNSubtarget &STM) { 503 I = MI; 504 unsigned Opc = MI->getOpcode(); 505 InstClass = getInstClass(Opc, TII); 506 507 if (InstClass == UNKNOWN) 508 return; 509 510 switch (InstClass) { 511 case DS_READ: 512 EltSize = 513 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 514 : 4; 515 break; 516 case DS_WRITE: 517 EltSize = 518 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 519 : 4; 520 break; 521 case S_BUFFER_LOAD_IMM: 522 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 523 break; 524 default: 525 EltSize = 4; 526 break; 527 } 528 529 if (InstClass == MIMG) { 530 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 531 // Offset is not considered for MIMG instructions. 532 Offset = 0; 533 } else { 534 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 535 Offset = I->getOperand(OffsetIdx).getImm(); 536 } 537 538 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 539 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 540 541 Width = getOpcodeWidth(*I, TII); 542 543 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 544 Offset &= 0xffff; 545 } else if (InstClass != MIMG) { 546 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 547 if (InstClass != S_BUFFER_LOAD_IMM) { 548 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 549 } 550 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 551 } 552 553 AddressRegs Regs = getRegs(Opc, TII); 554 555 NumAddresses = 0; 556 for (unsigned J = 0; J < Regs.NumVAddrs; J++) 557 AddrIdx[NumAddresses++] = 558 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J; 559 if (Regs.Addr) 560 AddrIdx[NumAddresses++] = 561 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr); 562 if (Regs.SBase) 563 AddrIdx[NumAddresses++] = 564 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase); 565 if (Regs.SRsrc) 566 AddrIdx[NumAddresses++] = 567 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); 568 if (Regs.SOffset) 569 AddrIdx[NumAddresses++] = 570 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset); 571 if (Regs.VAddr) 572 AddrIdx[NumAddresses++] = 573 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr); 574 if (Regs.SSamp) 575 AddrIdx[NumAddresses++] = 576 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp); 577 assert(NumAddresses <= MaxAddressRegs); 578 579 for (unsigned J = 0; J < NumAddresses; J++) 580 AddrReg[J] = &I->getOperand(AddrIdx[J]); 581 } 582 583 } // end anonymous namespace. 
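
// For reference, MIR tests typically exercise this pass in isolation with
// something like:
//   llc -march=amdgcn -mcpu=gfx900 -run-pass=si-load-store-opt -o - %s
// where "si-load-store-opt" is the pass argument name (DEBUG_TYPE) used in
// the INITIALIZE_PASS macros that follow.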

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Op.getReg().isPhysical())
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && RegDefs.count(Use.getReg())) ||
                        (Use.isDef() && Use.getReg().isPhysical() &&
                         PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
674 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 675 const MachineMemOperand *A, 676 const MachineMemOperand *B) { 677 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 678 unsigned Size = A->getSize() + B->getSize(); 679 // This function adds the offset parameter to the existing offset for A, 680 // so we pass 0 here as the offset and then manually set it to the correct 681 // value after the call. 682 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 683 MMO->setOffset(MinOffset); 684 return MMO; 685 } 686 687 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 688 const SIInstrInfo &TII, 689 const CombineInfo &Paired) { 690 assert(CI.InstClass == MIMG); 691 692 // Ignore instructions with tfe/lwe set. 693 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 694 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 695 696 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 697 return false; 698 699 // Check other optional immediate operands for equality. 700 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, 701 AMDGPU::OpName::d16, AMDGPU::OpName::unorm, 702 AMDGPU::OpName::da, AMDGPU::OpName::r128, 703 AMDGPU::OpName::a16, AMDGPU::OpName::dlc}; 704 705 for (auto op : OperandsToMatch) { 706 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 707 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 708 return false; 709 if (Idx != -1 && 710 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 711 return false; 712 } 713 714 // Check DMask for overlaps. 715 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 716 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 717 718 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 719 if ((1u << AllowedBitsForMin) <= MinMask) 720 return false; 721 722 return true; 723 } 724 725 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 726 unsigned ComponentCount, 727 const GCNSubtarget &STI) { 728 if (ComponentCount > 4) 729 return 0; 730 731 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 732 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 733 if (!OldFormatInfo) 734 return 0; 735 736 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 737 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 738 ComponentCount, 739 OldFormatInfo->NumFormat, STI); 740 741 if (!NewFormatInfo) 742 return 0; 743 744 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 745 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 746 747 return NewFormatInfo->Format; 748 } 749 750 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 751 const GCNSubtarget &STI, 752 CombineInfo &Paired, 753 bool Modify) { 754 assert(CI.InstClass != MIMG); 755 756 // XXX - Would the same offset be OK? Is there any reason this would happen or 757 // be useful? 758 if (CI.Offset == Paired.Offset) 759 return false; 760 761 // This won't be valid if the offset isn't aligned. 
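  // (E.g. with a 4-byte element size, a byte offset of 6 has no exact
  // element-sized offset.)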
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  unsigned EltOffset0 = CI.Offset / CI.EltSize;
  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
  }

  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset, Paired.Offset);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    if (Modify) {
      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    if (Modify) {
      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    return false;

  if (CI.InstClass != MIMG &&
      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
    return false;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  DenseSet<Register> RegDefsToMove;
  DenseSet<Register> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  MachineBasicBlock::iterator E = std::next(Paired.I);
  MachineBasicBlock::iterator MBBI = std::next(CI.I);
  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (; MBBI != E; ++MBBI) {

    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order in a situation when it
      // moves an instruction which exists in some other merge list.
      // In this case it must be dependent.
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will
      // need to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
961 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 962 InstsToMove)) 963 continue; 964 965 if (&*MBBI == &*Paired.I) { 966 // We need to go through the list of instructions that we plan to 967 // move and make sure they are all safe to move down past the merged 968 // instruction. 969 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 970 971 // Call offsetsCanBeCombined with modify = true so that the offsets are 972 // correct for the new instruction. This should return true, because 973 // this function should only be called on CombineInfo objects that 974 // have already been confirmed to be mergeable. 975 if (CI.InstClass != MIMG) 976 offsetsCanBeCombined(CI, *STM, Paired, true); 977 return true; 978 } 979 return false; 980 } 981 982 // We've found a load/store that we couldn't merge for some reason. 983 // We could potentially keep looking, but we'd need to make sure that 984 // it was safe to move I and also all the instruction in InstsToMove 985 // down past this instruction. 986 // check if we can move I across MBBI and if we can move all I's users 987 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 988 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 989 break; 990 } 991 return false; 992 } 993 994 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 995 if (STM->ldsRequiresM0Init()) 996 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 997 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 998 } 999 1000 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 1001 if (STM->ldsRequiresM0Init()) 1002 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 1003 1004 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 1005 : AMDGPU::DS_READ2ST64_B64_gfx9; 1006 } 1007 1008 MachineBasicBlock::iterator 1009 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1010 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1011 MachineBasicBlock *MBB = CI.I->getParent(); 1012 1013 // Be careful, since the addresses could be subregisters themselves in weird 1014 // cases, like vectors of pointers. 1015 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1016 1017 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1018 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1019 1020 unsigned NewOffset0 = CI.Offset; 1021 unsigned NewOffset1 = Paired.Offset; 1022 unsigned Opc = 1023 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1024 1025 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1026 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1027 1028 if (NewOffset0 > NewOffset1) { 1029 // Canonicalize the merged instruction so the smaller offset comes first. 1030 std::swap(NewOffset0, NewOffset1); 1031 std::swap(SubRegIdx0, SubRegIdx1); 1032 } 1033 1034 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1035 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1036 1037 const MCInstrDesc &Read2Desc = TII->get(Opc); 1038 1039 const TargetRegisterClass *SuperRC = 1040 (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; 1041 Register DestReg = MRI->createVirtualRegister(SuperRC); 1042 1043 DebugLoc DL = CI.I->getDebugLoc(); 1044 1045 Register BaseReg = AddrReg->getReg(); 1046 unsigned BaseSubReg = AddrReg->getSubReg(); 1047 unsigned BaseRegFlags = 0; 1048 if (CI.BaseOff) { 1049 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1050 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1051 .addImm(CI.BaseOff); 1052 1053 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1054 BaseRegFlags = RegState::Kill; 1055 1056 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1057 .addReg(ImmReg) 1058 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1059 .addImm(0); // clamp bit 1060 BaseSubReg = 0; 1061 } 1062 1063 MachineInstrBuilder Read2 = 1064 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1065 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1066 .addImm(NewOffset0) // offset0 1067 .addImm(NewOffset1) // offset1 1068 .addImm(0) // gds 1069 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1070 1071 (void)Read2; 1072 1073 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1074 1075 // Copy to the old destination registers. 1076 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1077 .add(*Dest0) // Copy to same destination including flags and sub reg. 1078 .addReg(DestReg, 0, SubRegIdx0); 1079 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1080 .add(*Dest1) 1081 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1082 1083 moveInstsAfter(Copy1, InstsToMove); 1084 1085 CI.I->eraseFromParent(); 1086 Paired.I->eraseFromParent(); 1087 1088 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1089 return Read2; 1090 } 1091 1092 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1093 if (STM->ldsRequiresM0Init()) 1094 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1095 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1096 : AMDGPU::DS_WRITE2_B64_gfx9; 1097 } 1098 1099 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1100 if (STM->ldsRequiresM0Init()) 1101 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1102 : AMDGPU::DS_WRITE2ST64_B64; 1103 1104 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1105 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1106 } 1107 1108 MachineBasicBlock::iterator 1109 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1110 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1111 MachineBasicBlock *MBB = CI.I->getParent(); 1112 1113 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1114 // sure we preserve the subregister index and any register flags set on them. 1115 const MachineOperand *AddrReg = 1116 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1117 const MachineOperand *Data0 = 1118 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1119 const MachineOperand *Data1 = 1120 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1121 1122 unsigned NewOffset0 = CI.Offset; 1123 unsigned NewOffset1 = Paired.Offset; 1124 unsigned Opc = 1125 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1126 1127 if (NewOffset0 > NewOffset1) { 1128 // Canonicalize the merged instruction so the smaller offset comes first. 
1129 std::swap(NewOffset0, NewOffset1); 1130 std::swap(Data0, Data1); 1131 } 1132 1133 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1134 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1135 1136 const MCInstrDesc &Write2Desc = TII->get(Opc); 1137 DebugLoc DL = CI.I->getDebugLoc(); 1138 1139 Register BaseReg = AddrReg->getReg(); 1140 unsigned BaseSubReg = AddrReg->getSubReg(); 1141 unsigned BaseRegFlags = 0; 1142 if (CI.BaseOff) { 1143 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1144 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1145 .addImm(CI.BaseOff); 1146 1147 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1148 BaseRegFlags = RegState::Kill; 1149 1150 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1151 .addReg(ImmReg) 1152 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1153 .addImm(0); // clamp bit 1154 BaseSubReg = 0; 1155 } 1156 1157 MachineInstrBuilder Write2 = 1158 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1159 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1160 .add(*Data0) // data0 1161 .add(*Data1) // data1 1162 .addImm(NewOffset0) // offset0 1163 .addImm(NewOffset1) // offset1 1164 .addImm(0) // gds 1165 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1166 1167 moveInstsAfter(Write2, InstsToMove); 1168 1169 CI.I->eraseFromParent(); 1170 Paired.I->eraseFromParent(); 1171 1172 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1173 return Write2; 1174 } 1175 1176 MachineBasicBlock::iterator 1177 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1178 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1179 MachineBasicBlock *MBB = CI.I->getParent(); 1180 DebugLoc DL = CI.I->getDebugLoc(); 1181 const unsigned Opcode = getNewOpcode(CI, Paired); 1182 1183 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1184 1185 Register DestReg = MRI->createVirtualRegister(SuperRC); 1186 unsigned MergedDMask = CI.DMask | Paired.DMask; 1187 unsigned DMaskIdx = 1188 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1189 1190 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1191 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1192 if (I == DMaskIdx) 1193 MIB.addImm(MergedDMask); 1194 else 1195 MIB.add((*CI.I).getOperand(I)); 1196 } 1197 1198 // It shouldn't be possible to get this far if the two instructions 1199 // don't have a single memoperand, because MachineInstr::mayAlias() 1200 // will return true if this is the case. 1201 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1202 1203 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1204 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1205 1206 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1207 1208 unsigned SubRegIdx0, SubRegIdx1; 1209 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1210 1211 // Copy to the old destination registers. 1212 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1213 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1214 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1215 1216 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1217 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1218 .addReg(DestReg, 0, SubRegIdx0); 1219 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1220 .add(*Dest1) 1221 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1222 1223 moveInstsAfter(Copy1, InstsToMove); 1224 1225 CI.I->eraseFromParent(); 1226 Paired.I->eraseFromParent(); 1227 return New; 1228 } 1229 1230 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1231 CombineInfo &CI, CombineInfo &Paired, 1232 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1233 MachineBasicBlock *MBB = CI.I->getParent(); 1234 DebugLoc DL = CI.I->getDebugLoc(); 1235 const unsigned Opcode = getNewOpcode(CI, Paired); 1236 1237 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1238 1239 Register DestReg = MRI->createVirtualRegister(SuperRC); 1240 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1241 1242 // It shouldn't be possible to get this far if the two instructions 1243 // don't have a single memoperand, because MachineInstr::mayAlias() 1244 // will return true if this is the case. 1245 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1246 1247 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1248 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1249 1250 MachineInstr *New = 1251 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1252 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1253 .addImm(MergedOffset) // offset 1254 .addImm(CI.GLC) // glc 1255 .addImm(CI.DLC) // dlc 1256 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1257 1258 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1259 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1260 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1261 1262 // Copy to the old destination registers. 1263 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1264 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1265 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1266 1267 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1268 .add(*Dest0) // Copy to same destination including flags and sub reg. 1269 .addReg(DestReg, 0, SubRegIdx0); 1270 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1271 .add(*Dest1) 1272 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1273 1274 moveInstsAfter(Copy1, InstsToMove); 1275 1276 CI.I->eraseFromParent(); 1277 Paired.I->eraseFromParent(); 1278 return New; 1279 } 1280 1281 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1282 CombineInfo &CI, CombineInfo &Paired, 1283 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1284 MachineBasicBlock *MBB = CI.I->getParent(); 1285 DebugLoc DL = CI.I->getDebugLoc(); 1286 1287 const unsigned Opcode = getNewOpcode(CI, Paired); 1288 1289 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1290 1291 // Copy to the new source register. 1292 Register DestReg = MRI->createVirtualRegister(SuperRC); 1293 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1294 1295 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1296 1297 AddressRegs Regs = getRegs(Opcode, *TII); 1298 1299 if (Regs.VAddr) 1300 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1301 1302 // It shouldn't be possible to get this far if the two instructions 1303 // don't have a single memoperand, because MachineInstr::mayAlias() 1304 // will return true if this is the case. 
1305 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1306 1307 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1308 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1309 1310 MachineInstr *New = 1311 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1312 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1313 .addImm(MergedOffset) // offset 1314 .addImm(CI.GLC) // glc 1315 .addImm(CI.SLC) // slc 1316 .addImm(0) // tfe 1317 .addImm(CI.DLC) // dlc 1318 .addImm(0) // swz 1319 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1320 1321 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1322 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1323 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1324 1325 // Copy to the old destination registers. 1326 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1327 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1328 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1329 1330 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1331 .add(*Dest0) // Copy to same destination including flags and sub reg. 1332 .addReg(DestReg, 0, SubRegIdx0); 1333 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1334 .add(*Dest1) 1335 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1336 1337 moveInstsAfter(Copy1, InstsToMove); 1338 1339 CI.I->eraseFromParent(); 1340 Paired.I->eraseFromParent(); 1341 return New; 1342 } 1343 1344 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1345 CombineInfo &CI, CombineInfo &Paired, 1346 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1347 MachineBasicBlock *MBB = CI.I->getParent(); 1348 DebugLoc DL = CI.I->getDebugLoc(); 1349 1350 const unsigned Opcode = getNewOpcode(CI, Paired); 1351 1352 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1353 1354 // Copy to the new source register. 1355 Register DestReg = MRI->createVirtualRegister(SuperRC); 1356 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1357 1358 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1359 1360 AddressRegs Regs = getRegs(Opcode, *TII); 1361 1362 if (Regs.VAddr) 1363 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1364 1365 unsigned JoinedFormat = 1366 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1367 1368 // It shouldn't be possible to get this far if the two instructions 1369 // don't have a single memoperand, because MachineInstr::mayAlias() 1370 // will return true if this is the case. 1371 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1372 1373 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1374 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1375 1376 MachineInstr *New = 1377 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1378 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1379 .addImm(MergedOffset) // offset 1380 .addImm(JoinedFormat) // format 1381 .addImm(CI.GLC) // glc 1382 .addImm(CI.SLC) // slc 1383 .addImm(0) // tfe 1384 .addImm(CI.DLC) // dlc 1385 .addImm(0) // swz 1386 .addMemOperand( 1387 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1388 1389 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1390 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1391 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1392 1393 // Copy to the old destination registers. 
1394 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1395 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1396 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1397 1398 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1399 .add(*Dest0) // Copy to same destination including flags and sub reg. 1400 .addReg(DestReg, 0, SubRegIdx0); 1401 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1402 .add(*Dest1) 1403 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1404 1405 moveInstsAfter(Copy1, InstsToMove); 1406 1407 CI.I->eraseFromParent(); 1408 Paired.I->eraseFromParent(); 1409 return New; 1410 } 1411 1412 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1413 CombineInfo &CI, CombineInfo &Paired, 1414 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1415 MachineBasicBlock *MBB = CI.I->getParent(); 1416 DebugLoc DL = CI.I->getDebugLoc(); 1417 1418 const unsigned Opcode = getNewOpcode(CI, Paired); 1419 1420 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1421 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1422 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1423 1424 // Copy to the new source register. 1425 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1426 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1427 1428 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1429 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1430 1431 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1432 .add(*Src0) 1433 .addImm(SubRegIdx0) 1434 .add(*Src1) 1435 .addImm(SubRegIdx1); 1436 1437 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1438 .addReg(SrcReg, RegState::Kill); 1439 1440 AddressRegs Regs = getRegs(Opcode, *TII); 1441 1442 if (Regs.VAddr) 1443 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1444 1445 unsigned JoinedFormat = 1446 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1447 1448 // It shouldn't be possible to get this far if the two instructions 1449 // don't have a single memoperand, because MachineInstr::mayAlias() 1450 // will return true if this is the case. 
1451 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1452 1453 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1454 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1455 1456 MachineInstr *New = 1457 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1458 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1459 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1460 .addImm(JoinedFormat) // format 1461 .addImm(CI.GLC) // glc 1462 .addImm(CI.SLC) // slc 1463 .addImm(0) // tfe 1464 .addImm(CI.DLC) // dlc 1465 .addImm(0) // swz 1466 .addMemOperand( 1467 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1468 1469 moveInstsAfter(MIB, InstsToMove); 1470 1471 CI.I->eraseFromParent(); 1472 Paired.I->eraseFromParent(); 1473 return New; 1474 } 1475 1476 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1477 const CombineInfo &Paired) { 1478 const unsigned Width = CI.Width + Paired.Width; 1479 1480 switch (CI.InstClass) { 1481 default: 1482 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1483 // FIXME: Handle d16 correctly 1484 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1485 Width); 1486 case TBUFFER_LOAD: 1487 case TBUFFER_STORE: 1488 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1489 Width); 1490 1491 case UNKNOWN: 1492 llvm_unreachable("Unknown instruction class"); 1493 case S_BUFFER_LOAD_IMM: 1494 switch (Width) { 1495 default: 1496 return 0; 1497 case 2: 1498 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1499 case 4: 1500 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1501 } 1502 case MIMG: 1503 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1504 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1505 } 1506 } 1507 1508 std::pair<unsigned, unsigned> 1509 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1510 1511 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1512 return std::make_pair(0, 0); 1513 1514 bool ReverseOrder; 1515 if (CI.InstClass == MIMG) { 1516 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1517 "No overlaps"); 1518 ReverseOrder = CI.DMask > Paired.DMask; 1519 } else 1520 ReverseOrder = CI.Offset > Paired.Offset; 1521 1522 static const unsigned Idxs[4][4] = { 1523 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1524 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1525 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1526 {AMDGPU::sub3, 0, 0, 0}, 1527 }; 1528 unsigned Idx0; 1529 unsigned Idx1; 1530 1531 assert(CI.Width >= 1 && CI.Width <= 3); 1532 assert(Paired.Width >= 1 && Paired.Width <= 3); 1533 1534 if (ReverseOrder) { 1535 Idx1 = Idxs[0][Paired.Width - 1]; 1536 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1537 } else { 1538 Idx0 = Idxs[0][CI.Width - 1]; 1539 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1540 } 1541 1542 return std::make_pair(Idx0, Idx1); 1543 } 1544 1545 const TargetRegisterClass * 1546 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1547 const CombineInfo &Paired) { 1548 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1549 switch (CI.Width + Paired.Width) { 1550 default: 1551 return nullptr; 1552 case 2: 1553 return &AMDGPU::SReg_64_XEXECRegClass; 1554 case 4: 1555 return &AMDGPU::SGPR_128RegClass; 1556 case 8: 1557 return &AMDGPU::SGPR_256RegClass; 1558 case 16: 1559 return &AMDGPU::SGPR_512RegClass; 1560 } 1561 } 
else { 1562 switch (CI.Width + Paired.Width) { 1563 default: 1564 return nullptr; 1565 case 2: 1566 return &AMDGPU::VReg_64RegClass; 1567 case 3: 1568 return &AMDGPU::VReg_96RegClass; 1569 case 4: 1570 return &AMDGPU::VReg_128RegClass; 1571 } 1572 } 1573 } 1574 1575 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1576 CombineInfo &CI, CombineInfo &Paired, 1577 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1578 MachineBasicBlock *MBB = CI.I->getParent(); 1579 DebugLoc DL = CI.I->getDebugLoc(); 1580 1581 const unsigned Opcode = getNewOpcode(CI, Paired); 1582 1583 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1584 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1585 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1586 1587 // Copy to the new source register. 1588 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1589 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1590 1591 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1592 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1593 1594 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1595 .add(*Src0) 1596 .addImm(SubRegIdx0) 1597 .add(*Src1) 1598 .addImm(SubRegIdx1); 1599 1600 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1601 .addReg(SrcReg, RegState::Kill); 1602 1603 AddressRegs Regs = getRegs(Opcode, *TII); 1604 1605 if (Regs.VAddr) 1606 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1607 1608 1609 // It shouldn't be possible to get this far if the two instructions 1610 // don't have a single memoperand, because MachineInstr::mayAlias() 1611 // will return true if this is the case. 1612 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1613 1614 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1615 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1616 1617 MachineInstr *New = 1618 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1619 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1620 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1621 .addImm(CI.GLC) // glc 1622 .addImm(CI.SLC) // slc 1623 .addImm(0) // tfe 1624 .addImm(CI.DLC) // dlc 1625 .addImm(0) // swz 1626 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1627 1628 moveInstsAfter(MIB, InstsToMove); 1629 1630 CI.I->eraseFromParent(); 1631 Paired.I->eraseFromParent(); 1632 return New; 1633 } 1634 1635 MachineOperand 1636 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1637 APInt V(32, Val, true); 1638 if (TII->isInlineConstant(V)) 1639 return MachineOperand::CreateImm(Val); 1640 1641 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1642 MachineInstr *Mov = 1643 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1644 TII->get(AMDGPU::S_MOV_B32), Reg) 1645 .addImm(Val); 1646 (void)Mov; 1647 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1648 return MachineOperand::CreateReg(Reg, false); 1649 } 1650 1651 // Compute base address using Addr and return the final register. 
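// The 64-bit base is rebuilt as a carry chain: V_ADD_CO_U32_e64 produces the
// low half, V_ADDC_U32_e64 consumes the carry for the high half, and a
// REG_SEQUENCE combines the two halves into the final 64-bit register.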
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "  "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "  "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "  "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
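// The kill flag is cleared on the rewritten vaddr operand because NewBase is
// typically shared by several instructions that are rebased onto the same
// anchor.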
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - the 32-bit base registers and subregisters
//  - the 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
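  // Only global memory instructions that have a scalar-base (saddr) form are
  // handled here; getGlobalSaddrOp() returns a negative value for any other
  // opcode, so everything else is rejected below.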
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the highest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
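    // For now, only instructions with the same opcode and a zero immediate
    // offset are considered as anchor candidates.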
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "    After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this,
    // so break the search. We can look after this barrier for separate
    // merges.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort the lists by offset and discard any list with fewer than two
  // entries, since a merge needs at least two instructions with the same base
  // address. The actual pairing happens later, in
  // optimizeInstsWithSameBaseAddr().
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}