//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but the offsets are close enough together to fit in 8 bits after
//   adjustment, we can add to the base pointer and use the new reduced
//   offsets.
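//
// Note (illustrative): for the DS case, when both element offsets are
// multiples of 64 the pass already selects the stride-64 forms, which encode
// each offset in units of 64 elements, e.g.
//  ds_read_b32 v0, v2 offset:0
//  ds_read_b32 v1, v2 offset:256
// ==>
//  ds_read2st64_b32 v[0:1], v2 offset0:0 offset1:1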
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
  SSAMP = 0x20,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    bool GLC;
    bool SLC;
    bool DLC;
    bool UseST64;
    int AddrIdx[5];
    const MachineOperand *AddrReg[5];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (Register::isPhysicalRegister(AddrOp->getReg()))
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
175 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 176 return false; 177 } 178 return true; 179 } 180 181 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 182 const GCNSubtarget &STM); 183 }; 184 185 struct BaseRegisters { 186 Register LoReg; 187 Register HiReg; 188 189 unsigned LoSubReg = 0; 190 unsigned HiSubReg = 0; 191 }; 192 193 struct MemAddress { 194 BaseRegisters Base; 195 int64_t Offset = 0; 196 }; 197 198 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 199 200 private: 201 const GCNSubtarget *STM = nullptr; 202 const SIInstrInfo *TII = nullptr; 203 const SIRegisterInfo *TRI = nullptr; 204 MachineRegisterInfo *MRI = nullptr; 205 AliasAnalysis *AA = nullptr; 206 bool OptimizeAgain; 207 208 static bool dmasksCanBeCombined(const CombineInfo &CI, 209 const SIInstrInfo &TII, 210 const CombineInfo &Paired); 211 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 212 CombineInfo &Paired, bool Modify = false); 213 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 214 const CombineInfo &Paired); 215 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 216 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 217 const CombineInfo &Paired); 218 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 219 const CombineInfo &Paired); 220 221 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 222 SmallVectorImpl<MachineInstr *> &InstsToMove); 223 224 unsigned read2Opcode(unsigned EltSize) const; 225 unsigned read2ST64Opcode(unsigned EltSize) const; 226 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 227 CombineInfo &Paired, 228 const SmallVectorImpl<MachineInstr *> &InstsToMove); 229 230 unsigned write2Opcode(unsigned EltSize) const; 231 unsigned write2ST64Opcode(unsigned EltSize) const; 232 MachineBasicBlock::iterator 233 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 234 const SmallVectorImpl<MachineInstr *> &InstsToMove); 235 MachineBasicBlock::iterator 236 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 237 const SmallVectorImpl<MachineInstr *> &InstsToMove); 238 MachineBasicBlock::iterator 239 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 240 const SmallVectorImpl<MachineInstr *> &InstsToMove); 241 MachineBasicBlock::iterator 242 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 243 const SmallVectorImpl<MachineInstr *> &InstsToMove); 244 MachineBasicBlock::iterator 245 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 246 const SmallVectorImpl<MachineInstr *> &InstsToMove); 247 MachineBasicBlock::iterator 248 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 249 const SmallVectorImpl<MachineInstr *> &InstsToMove); 250 MachineBasicBlock::iterator 251 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 252 const SmallVectorImpl<MachineInstr *> &InstsToMove); 253 254 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 255 int32_t NewOffset) const; 256 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 257 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 258 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 259 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 260 /// Promotes constant offset to the immediate by adjusting the base. 
It 261 /// tries to use a base from the nearby instructions that allows it to have 262 /// a 13bit constant offset which gets promoted to the immediate. 263 bool promoteConstantOffsetToImm(MachineInstr &CI, 264 MemInfoMap &Visited, 265 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 266 void addInstToMergeableList(const CombineInfo &CI, 267 std::list<std::list<CombineInfo> > &MergeableInsts) const; 268 269 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 270 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 271 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 272 std::list<std::list<CombineInfo>> &MergeableInsts) const; 273 274 public: 275 static char ID; 276 277 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 278 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 279 } 280 281 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 282 bool &OptimizeListAgain); 283 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 284 285 bool runOnMachineFunction(MachineFunction &MF) override; 286 287 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 288 289 void getAnalysisUsage(AnalysisUsage &AU) const override { 290 AU.setPreservesCFG(); 291 AU.addRequired<AAResultsWrapperPass>(); 292 293 MachineFunctionPass::getAnalysisUsage(AU); 294 } 295 }; 296 297 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 298 const unsigned Opc = MI.getOpcode(); 299 300 if (TII.isMUBUF(Opc)) { 301 // FIXME: Handle d16 correctly 302 return AMDGPU::getMUBUFElements(Opc); 303 } 304 if (TII.isMIMG(MI)) { 305 uint64_t DMaskImm = 306 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 307 return countPopulation(DMaskImm); 308 } 309 if (TII.isMTBUF(Opc)) { 310 return AMDGPU::getMTBUFElements(Opc); 311 } 312 313 switch (Opc) { 314 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 315 return 1; 316 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 317 return 2; 318 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 319 return 4; 320 default: 321 return 0; 322 } 323 } 324 325 /// Maps instruction opcode to enum InstClassEnum. 326 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 327 switch (Opc) { 328 default: 329 if (TII.isMUBUF(Opc)) { 330 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 331 default: 332 return UNKNOWN; 333 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 334 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 335 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 336 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 337 return BUFFER_LOAD; 338 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 339 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 340 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 341 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 342 return BUFFER_STORE; 343 } 344 } 345 if (TII.isMIMG(Opc)) { 346 // Ignore instructions encoded without vaddr. 347 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) 348 return UNKNOWN; 349 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
350 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 351 TII.isGather4(Opc)) 352 return UNKNOWN; 353 return MIMG; 354 } 355 if (TII.isMTBUF(Opc)) { 356 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 357 default: 358 return UNKNOWN; 359 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 360 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 361 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 362 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 363 return TBUFFER_LOAD; 364 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 365 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 366 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 367 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 368 return TBUFFER_STORE; 369 } 370 } 371 return UNKNOWN; 372 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 374 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 375 return S_BUFFER_LOAD_IMM; 376 case AMDGPU::DS_READ_B32: 377 case AMDGPU::DS_READ_B32_gfx9: 378 case AMDGPU::DS_READ_B64: 379 case AMDGPU::DS_READ_B64_gfx9: 380 return DS_READ; 381 case AMDGPU::DS_WRITE_B32: 382 case AMDGPU::DS_WRITE_B32_gfx9: 383 case AMDGPU::DS_WRITE_B64: 384 case AMDGPU::DS_WRITE_B64_gfx9: 385 return DS_WRITE; 386 } 387 } 388 389 /// Determines instruction subclass from opcode. Only instructions 390 /// of the same subclass can be merged together. 391 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 392 switch (Opc) { 393 default: 394 if (TII.isMUBUF(Opc)) 395 return AMDGPU::getMUBUFBaseOpcode(Opc); 396 if (TII.isMIMG(Opc)) { 397 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 398 assert(Info); 399 return Info->BaseOpcode; 400 } 401 if (TII.isMTBUF(Opc)) 402 return AMDGPU::getMTBUFBaseOpcode(Opc); 403 return -1; 404 case AMDGPU::DS_READ_B32: 405 case AMDGPU::DS_READ_B32_gfx9: 406 case AMDGPU::DS_READ_B64: 407 case AMDGPU::DS_READ_B64_gfx9: 408 case AMDGPU::DS_WRITE_B32: 409 case AMDGPU::DS_WRITE_B32_gfx9: 410 case AMDGPU::DS_WRITE_B64: 411 case AMDGPU::DS_WRITE_B64_gfx9: 412 return Opc; 413 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 414 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 415 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 416 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 417 } 418 } 419 420 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { 421 if (TII.isMUBUF(Opc)) { 422 unsigned result = 0; 423 424 if (AMDGPU::getMUBUFHasVAddr(Opc)) { 425 result |= VADDR; 426 } 427 428 if (AMDGPU::getMUBUFHasSrsrc(Opc)) { 429 result |= SRSRC; 430 } 431 432 if (AMDGPU::getMUBUFHasSoffset(Opc)) { 433 result |= SOFFSET; 434 } 435 436 return result; 437 } 438 439 if (TII.isMIMG(Opc)) { 440 unsigned result = VADDR | SRSRC; 441 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 442 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 443 result |= SSAMP; 444 445 return result; 446 } 447 if (TII.isMTBUF(Opc)) { 448 unsigned result = 0; 449 450 if (AMDGPU::getMTBUFHasVAddr(Opc)) { 451 result |= VADDR; 452 } 453 454 if (AMDGPU::getMTBUFHasSrsrc(Opc)) { 455 result |= SRSRC; 456 } 457 458 if (AMDGPU::getMTBUFHasSoffset(Opc)) { 459 result |= SOFFSET; 460 } 461 462 return result; 463 } 464 465 switch (Opc) { 466 default: 467 return 0; 468 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 469 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 470 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 471 return SBASE; 472 case AMDGPU::DS_READ_B32: 473 case AMDGPU::DS_READ_B64: 474 case AMDGPU::DS_READ_B32_gfx9: 475 case AMDGPU::DS_READ_B64_gfx9: 476 case AMDGPU::DS_WRITE_B32: 477 case AMDGPU::DS_WRITE_B64: 478 case AMDGPU::DS_WRITE_B32_gfx9: 479 
case AMDGPU::DS_WRITE_B64_gfx9: 480 return ADDR; 481 } 482 } 483 484 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 485 const SIInstrInfo &TII, 486 const GCNSubtarget &STM) { 487 I = MI; 488 unsigned Opc = MI->getOpcode(); 489 InstClass = getInstClass(Opc, TII); 490 491 if (InstClass == UNKNOWN) 492 return; 493 494 switch (InstClass) { 495 case DS_READ: 496 EltSize = 497 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 498 : 4; 499 break; 500 case DS_WRITE: 501 EltSize = 502 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 503 : 4; 504 break; 505 case S_BUFFER_LOAD_IMM: 506 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 507 break; 508 default: 509 EltSize = 4; 510 break; 511 } 512 513 if (InstClass == MIMG) { 514 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 515 // Offset is not considered for MIMG instructions. 516 Offset = 0; 517 } else { 518 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 519 Offset = I->getOperand(OffsetIdx).getImm(); 520 } 521 522 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 523 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 524 525 Width = getOpcodeWidth(*I, TII); 526 527 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 528 Offset &= 0xffff; 529 } else if (InstClass != MIMG) { 530 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 531 if (InstClass != S_BUFFER_LOAD_IMM) { 532 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 533 } 534 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 535 } 536 537 unsigned AddrOpName[5] = {0}; 538 NumAddresses = 0; 539 const unsigned Regs = getRegs(I->getOpcode(), TII); 540 541 if (Regs & ADDR) { 542 AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; 543 } 544 545 if (Regs & SBASE) { 546 AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; 547 } 548 549 if (Regs & SRSRC) { 550 AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; 551 } 552 553 if (Regs & SOFFSET) { 554 AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; 555 } 556 557 if (Regs & VADDR) { 558 AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; 559 } 560 561 if (Regs & SSAMP) { 562 AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; 563 } 564 565 for (unsigned i = 0; i < NumAddresses; i++) { 566 AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); 567 AddrReg[i] = &I->getOperand(AddrIdx[i]); 568 } 569 } 570 571 } // end anonymous namespace. 

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and that they reference adjacent memory.
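// For example (illustrative): merging a 4-byte memoperand at offset 16 with a
// 4-byte memoperand at offset 20 yields a single memoperand of size 8 at
// offset 16.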
663 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 664 const MachineMemOperand *A, 665 const MachineMemOperand *B) { 666 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 667 unsigned Size = A->getSize() + B->getSize(); 668 // This function adds the offset parameter to the existing offset for A, 669 // so we pass 0 here as the offset and then manually set it to the correct 670 // value after the call. 671 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 672 MMO->setOffset(MinOffset); 673 return MMO; 674 } 675 676 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 677 const SIInstrInfo &TII, 678 const CombineInfo &Paired) { 679 assert(CI.InstClass == MIMG); 680 681 // Ignore instructions with tfe/lwe set. 682 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 683 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 684 685 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 686 return false; 687 688 // Check other optional immediate operands for equality. 689 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, 690 AMDGPU::OpName::d16, AMDGPU::OpName::unorm, 691 AMDGPU::OpName::da, AMDGPU::OpName::r128, 692 AMDGPU::OpName::a16}; 693 694 for (auto op : OperandsToMatch) { 695 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 696 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 697 return false; 698 if (Idx != -1 && 699 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 700 return false; 701 } 702 703 // Check DMask for overlaps. 704 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 705 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 706 707 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 708 if ((1u << AllowedBitsForMin) <= MinMask) 709 return false; 710 711 return true; 712 } 713 714 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 715 unsigned ComponentCount, 716 const GCNSubtarget &STI) { 717 if (ComponentCount > 4) 718 return 0; 719 720 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 721 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 722 if (!OldFormatInfo) 723 return 0; 724 725 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 726 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 727 ComponentCount, 728 OldFormatInfo->NumFormat, STI); 729 730 if (!NewFormatInfo) 731 return 0; 732 733 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 734 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 735 736 return NewFormatInfo->Format; 737 } 738 739 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 740 const GCNSubtarget &STI, 741 CombineInfo &Paired, 742 bool Modify) { 743 assert(CI.InstClass != MIMG); 744 745 // XXX - Would the same offset be OK? Is there any reason this would happen or 746 // be useful? 747 if (CI.Offset == Paired.Offset) 748 return false; 749 750 // This won't be valid if the offset isn't aligned. 
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  unsigned EltOffset0 = CI.Offset / CI.EltSize;
  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
  }

  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the offsets fit in the plain 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset, Paired.Offset);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    if (Modify) {
      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    if (Modify) {
      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    return false;

  if (CI.InstClass != MIMG &&
      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
    return false;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  DenseSet<Register> RegDefsToMove;
  DenseSet<Register> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  MachineBasicBlock::iterator E = std::next(Paired.I);
  MachineBasicBlock::iterator MBBI = std::next(CI.I);
  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (; MBBI != E; ++MBBI) {

    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order in a situation when it
      // moves an instruction which exists in some other merge list.
      // In this case it must be dependent.
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
950 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 951 InstsToMove)) 952 continue; 953 954 if (&*MBBI == &*Paired.I) { 955 // We need to go through the list of instructions that we plan to 956 // move and make sure they are all safe to move down past the merged 957 // instruction. 958 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 959 960 // Call offsetsCanBeCombined with modify = true so that the offsets are 961 // correct for the new instruction. This should return true, because 962 // this function should only be called on CombineInfo objects that 963 // have already been confirmed to be mergeable. 964 if (CI.InstClass != MIMG) 965 offsetsCanBeCombined(CI, *STM, Paired, true); 966 return true; 967 } 968 return false; 969 } 970 971 // We've found a load/store that we couldn't merge for some reason. 972 // We could potentially keep looking, but we'd need to make sure that 973 // it was safe to move I and also all the instruction in InstsToMove 974 // down past this instruction. 975 // check if we can move I across MBBI and if we can move all I's users 976 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 977 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 978 break; 979 } 980 return false; 981 } 982 983 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 984 if (STM->ldsRequiresM0Init()) 985 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 986 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 987 } 988 989 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 990 if (STM->ldsRequiresM0Init()) 991 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 992 993 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 994 : AMDGPU::DS_READ2ST64_B64_gfx9; 995 } 996 997 MachineBasicBlock::iterator 998 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 999 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1000 MachineBasicBlock *MBB = CI.I->getParent(); 1001 1002 // Be careful, since the addresses could be subregisters themselves in weird 1003 // cases, like vectors of pointers. 1004 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1005 1006 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1007 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1008 1009 unsigned NewOffset0 = CI.Offset; 1010 unsigned NewOffset1 = Paired.Offset; 1011 unsigned Opc = 1012 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1013 1014 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1015 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1016 1017 if (NewOffset0 > NewOffset1) { 1018 // Canonicalize the merged instruction so the smaller offset comes first. 1019 std::swap(NewOffset0, NewOffset1); 1020 std::swap(SubRegIdx0, SubRegIdx1); 1021 } 1022 1023 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1024 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1025 1026 const MCInstrDesc &Read2Desc = TII->get(Opc); 1027 1028 const TargetRegisterClass *SuperRC = 1029 (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; 1030 Register DestReg = MRI->createVirtualRegister(SuperRC); 1031 1032 DebugLoc DL = CI.I->getDebugLoc(); 1033 1034 Register BaseReg = AddrReg->getReg(); 1035 unsigned BaseSubReg = AddrReg->getSubReg(); 1036 unsigned BaseRegFlags = 0; 1037 if (CI.BaseOff) { 1038 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1039 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1040 .addImm(CI.BaseOff); 1041 1042 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1043 BaseRegFlags = RegState::Kill; 1044 1045 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1046 .addReg(ImmReg) 1047 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1048 .addImm(0); // clamp bit 1049 BaseSubReg = 0; 1050 } 1051 1052 MachineInstrBuilder Read2 = 1053 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1054 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1055 .addImm(NewOffset0) // offset0 1056 .addImm(NewOffset1) // offset1 1057 .addImm(0) // gds 1058 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1059 1060 (void)Read2; 1061 1062 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1063 1064 // Copy to the old destination registers. 1065 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1066 .add(*Dest0) // Copy to same destination including flags and sub reg. 1067 .addReg(DestReg, 0, SubRegIdx0); 1068 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1069 .add(*Dest1) 1070 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1071 1072 moveInstsAfter(Copy1, InstsToMove); 1073 1074 CI.I->eraseFromParent(); 1075 Paired.I->eraseFromParent(); 1076 1077 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1078 return Read2; 1079 } 1080 1081 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1082 if (STM->ldsRequiresM0Init()) 1083 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1084 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1085 : AMDGPU::DS_WRITE2_B64_gfx9; 1086 } 1087 1088 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1089 if (STM->ldsRequiresM0Init()) 1090 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1091 : AMDGPU::DS_WRITE2ST64_B64; 1092 1093 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1094 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1095 } 1096 1097 MachineBasicBlock::iterator 1098 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1099 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1100 MachineBasicBlock *MBB = CI.I->getParent(); 1101 1102 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1103 // sure we preserve the subregister index and any register flags set on them. 1104 const MachineOperand *AddrReg = 1105 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1106 const MachineOperand *Data0 = 1107 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1108 const MachineOperand *Data1 = 1109 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1110 1111 unsigned NewOffset0 = CI.Offset; 1112 unsigned NewOffset1 = Paired.Offset; 1113 unsigned Opc = 1114 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1115 1116 if (NewOffset0 > NewOffset1) { 1117 // Canonicalize the merged instruction so the smaller offset comes first. 
1118 std::swap(NewOffset0, NewOffset1); 1119 std::swap(Data0, Data1); 1120 } 1121 1122 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1123 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1124 1125 const MCInstrDesc &Write2Desc = TII->get(Opc); 1126 DebugLoc DL = CI.I->getDebugLoc(); 1127 1128 Register BaseReg = AddrReg->getReg(); 1129 unsigned BaseSubReg = AddrReg->getSubReg(); 1130 unsigned BaseRegFlags = 0; 1131 if (CI.BaseOff) { 1132 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1133 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1134 .addImm(CI.BaseOff); 1135 1136 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1137 BaseRegFlags = RegState::Kill; 1138 1139 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1140 .addReg(ImmReg) 1141 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1142 .addImm(0); // clamp bit 1143 BaseSubReg = 0; 1144 } 1145 1146 MachineInstrBuilder Write2 = 1147 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1148 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1149 .add(*Data0) // data0 1150 .add(*Data1) // data1 1151 .addImm(NewOffset0) // offset0 1152 .addImm(NewOffset1) // offset1 1153 .addImm(0) // gds 1154 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1155 1156 moveInstsAfter(Write2, InstsToMove); 1157 1158 CI.I->eraseFromParent(); 1159 Paired.I->eraseFromParent(); 1160 1161 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1162 return Write2; 1163 } 1164 1165 MachineBasicBlock::iterator 1166 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1167 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1168 MachineBasicBlock *MBB = CI.I->getParent(); 1169 DebugLoc DL = CI.I->getDebugLoc(); 1170 const unsigned Opcode = getNewOpcode(CI, Paired); 1171 1172 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1173 1174 Register DestReg = MRI->createVirtualRegister(SuperRC); 1175 unsigned MergedDMask = CI.DMask | Paired.DMask; 1176 unsigned DMaskIdx = 1177 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1178 1179 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1180 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1181 if (I == DMaskIdx) 1182 MIB.addImm(MergedDMask); 1183 else 1184 MIB.add((*CI.I).getOperand(I)); 1185 } 1186 1187 // It shouldn't be possible to get this far if the two instructions 1188 // don't have a single memoperand, because MachineInstr::mayAlias() 1189 // will return true if this is the case. 1190 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1191 1192 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1193 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1194 1195 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1196 1197 unsigned SubRegIdx0, SubRegIdx1; 1198 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1199 1200 // Copy to the old destination registers. 1201 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1202 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1203 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1204 1205 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1206 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1207 .addReg(DestReg, 0, SubRegIdx0); 1208 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1209 .add(*Dest1) 1210 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1211 1212 moveInstsAfter(Copy1, InstsToMove); 1213 1214 CI.I->eraseFromParent(); 1215 Paired.I->eraseFromParent(); 1216 return New; 1217 } 1218 1219 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1220 CombineInfo &CI, CombineInfo &Paired, 1221 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1222 MachineBasicBlock *MBB = CI.I->getParent(); 1223 DebugLoc DL = CI.I->getDebugLoc(); 1224 const unsigned Opcode = getNewOpcode(CI, Paired); 1225 1226 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1227 1228 Register DestReg = MRI->createVirtualRegister(SuperRC); 1229 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1230 1231 // It shouldn't be possible to get this far if the two instructions 1232 // don't have a single memoperand, because MachineInstr::mayAlias() 1233 // will return true if this is the case. 1234 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1235 1236 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1237 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1238 1239 MachineInstr *New = 1240 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1241 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1242 .addImm(MergedOffset) // offset 1243 .addImm(CI.GLC) // glc 1244 .addImm(CI.DLC) // dlc 1245 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1246 1247 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1248 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1249 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1250 1251 // Copy to the old destination registers. 1252 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1253 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1254 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1255 1256 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1257 .add(*Dest0) // Copy to same destination including flags and sub reg. 1258 .addReg(DestReg, 0, SubRegIdx0); 1259 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1260 .add(*Dest1) 1261 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1262 1263 moveInstsAfter(Copy1, InstsToMove); 1264 1265 CI.I->eraseFromParent(); 1266 Paired.I->eraseFromParent(); 1267 return New; 1268 } 1269 1270 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1271 CombineInfo &CI, CombineInfo &Paired, 1272 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1273 MachineBasicBlock *MBB = CI.I->getParent(); 1274 DebugLoc DL = CI.I->getDebugLoc(); 1275 1276 const unsigned Opcode = getNewOpcode(CI, Paired); 1277 1278 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1279 1280 // Copy to the new source register. 1281 Register DestReg = MRI->createVirtualRegister(SuperRC); 1282 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1283 1284 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1285 1286 const unsigned Regs = getRegs(Opcode, *TII); 1287 1288 if (Regs & VADDR) 1289 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1290 1291 // It shouldn't be possible to get this far if the two instructions 1292 // don't have a single memoperand, because MachineInstr::mayAlias() 1293 // will return true if this is the case. 
1294 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1295 1296 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1297 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1298 1299 MachineInstr *New = 1300 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1301 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1302 .addImm(MergedOffset) // offset 1303 .addImm(CI.GLC) // glc 1304 .addImm(CI.SLC) // slc 1305 .addImm(0) // tfe 1306 .addImm(CI.DLC) // dlc 1307 .addImm(0) // swz 1308 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1309 1310 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1311 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1312 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1313 1314 // Copy to the old destination registers. 1315 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1316 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1317 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1318 1319 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1320 .add(*Dest0) // Copy to same destination including flags and sub reg. 1321 .addReg(DestReg, 0, SubRegIdx0); 1322 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1323 .add(*Dest1) 1324 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1325 1326 moveInstsAfter(Copy1, InstsToMove); 1327 1328 CI.I->eraseFromParent(); 1329 Paired.I->eraseFromParent(); 1330 return New; 1331 } 1332 1333 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1334 CombineInfo &CI, CombineInfo &Paired, 1335 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1336 MachineBasicBlock *MBB = CI.I->getParent(); 1337 DebugLoc DL = CI.I->getDebugLoc(); 1338 1339 const unsigned Opcode = getNewOpcode(CI, Paired); 1340 1341 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1342 1343 // Copy to the new source register. 1344 Register DestReg = MRI->createVirtualRegister(SuperRC); 1345 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1346 1347 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1348 1349 const unsigned Regs = getRegs(Opcode, *TII); 1350 1351 if (Regs & VADDR) 1352 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1353 1354 unsigned JoinedFormat = 1355 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1356 1357 // It shouldn't be possible to get this far if the two instructions 1358 // don't have a single memoperand, because MachineInstr::mayAlias() 1359 // will return true if this is the case. 1360 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1361 1362 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1363 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1364 1365 MachineInstr *New = 1366 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1367 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1368 .addImm(MergedOffset) // offset 1369 .addImm(JoinedFormat) // format 1370 .addImm(CI.GLC) // glc 1371 .addImm(CI.SLC) // slc 1372 .addImm(0) // tfe 1373 .addImm(CI.DLC) // dlc 1374 .addImm(0) // swz 1375 .addMemOperand( 1376 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1377 1378 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1379 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1380 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1381 1382 // Copy to the old destination registers. 
1383 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1384 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1385 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1386 1387 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1388 .add(*Dest0) // Copy to same destination including flags and sub reg. 1389 .addReg(DestReg, 0, SubRegIdx0); 1390 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1391 .add(*Dest1) 1392 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1393 1394 moveInstsAfter(Copy1, InstsToMove); 1395 1396 CI.I->eraseFromParent(); 1397 Paired.I->eraseFromParent(); 1398 return New; 1399 } 1400 1401 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1402 CombineInfo &CI, CombineInfo &Paired, 1403 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1404 MachineBasicBlock *MBB = CI.I->getParent(); 1405 DebugLoc DL = CI.I->getDebugLoc(); 1406 1407 const unsigned Opcode = getNewOpcode(CI, Paired); 1408 1409 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1410 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1411 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1412 1413 // Copy to the new source register. 1414 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1415 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1416 1417 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1418 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1419 1420 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1421 .add(*Src0) 1422 .addImm(SubRegIdx0) 1423 .add(*Src1) 1424 .addImm(SubRegIdx1); 1425 1426 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1427 .addReg(SrcReg, RegState::Kill); 1428 1429 const unsigned Regs = getRegs(Opcode, *TII); 1430 1431 if (Regs & VADDR) 1432 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1433 1434 unsigned JoinedFormat = 1435 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1436 1437 // It shouldn't be possible to get this far if the two instructions 1438 // don't have a single memoperand, because MachineInstr::mayAlias() 1439 // will return true if this is the case. 
1440 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1441 1442 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1443 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1444 1445 MachineInstr *New = 1446 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1447 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1448 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1449 .addImm(JoinedFormat) // format 1450 .addImm(CI.GLC) // glc 1451 .addImm(CI.SLC) // slc 1452 .addImm(0) // tfe 1453 .addImm(CI.DLC) // dlc 1454 .addImm(0) // swz 1455 .addMemOperand( 1456 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1457 1458 moveInstsAfter(MIB, InstsToMove); 1459 1460 CI.I->eraseFromParent(); 1461 Paired.I->eraseFromParent(); 1462 return New; 1463 } 1464 1465 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1466 const CombineInfo &Paired) { 1467 const unsigned Width = CI.Width + Paired.Width; 1468 1469 switch (CI.InstClass) { 1470 default: 1471 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1472 // FIXME: Handle d16 correctly 1473 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1474 Width); 1475 case TBUFFER_LOAD: 1476 case TBUFFER_STORE: 1477 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1478 Width); 1479 1480 case UNKNOWN: 1481 llvm_unreachable("Unknown instruction class"); 1482 case S_BUFFER_LOAD_IMM: 1483 switch (Width) { 1484 default: 1485 return 0; 1486 case 2: 1487 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1488 case 4: 1489 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1490 } 1491 case MIMG: 1492 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1493 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1494 } 1495 } 1496 1497 std::pair<unsigned, unsigned> 1498 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1499 1500 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1501 return std::make_pair(0, 0); 1502 1503 bool ReverseOrder; 1504 if (CI.InstClass == MIMG) { 1505 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1506 "No overlaps"); 1507 ReverseOrder = CI.DMask > Paired.DMask; 1508 } else 1509 ReverseOrder = CI.Offset > Paired.Offset; 1510 1511 static const unsigned Idxs[4][4] = { 1512 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1513 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1514 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1515 {AMDGPU::sub3, 0, 0, 0}, 1516 }; 1517 unsigned Idx0; 1518 unsigned Idx1; 1519 1520 assert(CI.Width >= 1 && CI.Width <= 3); 1521 assert(Paired.Width >= 1 && Paired.Width <= 3); 1522 1523 if (ReverseOrder) { 1524 Idx1 = Idxs[0][Paired.Width - 1]; 1525 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1526 } else { 1527 Idx0 = Idxs[0][CI.Width - 1]; 1528 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1529 } 1530 1531 return std::make_pair(Idx0, Idx1); 1532 } 1533 1534 const TargetRegisterClass * 1535 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1536 const CombineInfo &Paired) { 1537 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1538 switch (CI.Width + Paired.Width) { 1539 default: 1540 return nullptr; 1541 case 2: 1542 return &AMDGPU::SReg_64_XEXECRegClass; 1543 case 4: 1544 return &AMDGPU::SGPR_128RegClass; 1545 case 8: 1546 return &AMDGPU::SGPR_256RegClass; 1547 case 16: 1548 return &AMDGPU::SGPR_512RegClass; 1549 } 1550 } 
else { 1551 switch (CI.Width + Paired.Width) { 1552 default: 1553 return nullptr; 1554 case 2: 1555 return &AMDGPU::VReg_64RegClass; 1556 case 3: 1557 return &AMDGPU::VReg_96RegClass; 1558 case 4: 1559 return &AMDGPU::VReg_128RegClass; 1560 } 1561 } 1562 } 1563 1564 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1565 CombineInfo &CI, CombineInfo &Paired, 1566 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1567 MachineBasicBlock *MBB = CI.I->getParent(); 1568 DebugLoc DL = CI.I->getDebugLoc(); 1569 1570 const unsigned Opcode = getNewOpcode(CI, Paired); 1571 1572 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1573 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1574 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1575 1576 // Copy to the new source register. 1577 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1578 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1579 1580 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1581 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1582 1583 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1584 .add(*Src0) 1585 .addImm(SubRegIdx0) 1586 .add(*Src1) 1587 .addImm(SubRegIdx1); 1588 1589 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1590 .addReg(SrcReg, RegState::Kill); 1591 1592 const unsigned Regs = getRegs(Opcode, *TII); 1593 1594 if (Regs & VADDR) 1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1596 1597 1598 // It shouldn't be possible to get this far if the two instructions 1599 // don't have a single memoperand, because MachineInstr::mayAlias() 1600 // will return true if this is the case. 1601 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1602 1603 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1604 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1605 1606 MachineInstr *New = 1607 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1608 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1609 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1610 .addImm(CI.GLC) // glc 1611 .addImm(CI.SLC) // slc 1612 .addImm(0) // tfe 1613 .addImm(CI.DLC) // dlc 1614 .addImm(0) // swz 1615 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1616 1617 moveInstsAfter(MIB, InstsToMove); 1618 1619 CI.I->eraseFromParent(); 1620 Paired.I->eraseFromParent(); 1621 return New; 1622 } 1623 1624 MachineOperand 1625 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1626 APInt V(32, Val, true); 1627 if (TII->isInlineConstant(V)) 1628 return MachineOperand::CreateImm(Val); 1629 1630 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1631 MachineInstr *Mov = 1632 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1633 TII->get(AMDGPU::S_MOV_B32), Reg) 1634 .addImm(Val); 1635 (void)Mov; 1636 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1637 return MachineOperand::CreateReg(Reg, false); 1638 } 1639 1640 // Compute base address using Addr and return the final register. 
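// The emitted sequence is roughly (illustrative; register classes of the
// carry outputs omitted):
//   %lo:vgpr_32, %carry = V_ADD_I32_e64 Addr.Base.LoReg, OffsetLo, 0(clamp)
//   %hi:vgpr_32, %dead = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry, 0(clamp)
//   %base:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1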
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
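// The new base register may be shared by several rewritten instructions, so
// the kill flag on the vaddr operand is cleared rather than preserved.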
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - 32-bit base registers and subregisters
// - 64-bit constant offset
// The base computation is expected to look like:
//  %OFFSET0:sgpr_32 = S_MOV_B32 8000
//  %LO:vgpr_32, %c:sreg_64_xexec =
//      V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
//  %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//  %Base:vreg_64 =
//      REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
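  // getGlobalSaddrOp() returns a negative value for opcodes that have no
  // global SADDR variant, so this effectively restricts the optimization to
  // global_load*/global_store* instructions.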
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
  // the same base registers) whose offset has the largest distance from MI's
  // offset that is still legal as a 13-bit immediate.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1, 0)
  //   addr2 = &a + 6144;   load2 = load(addr2, 0)
  //   addr3 = &a + 8192;   load3 = load(addr3, 0)
  //   addr4 = &a + 10240;  load4 = load(addr4, 0)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, and load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes differ.
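    // For now only instructions with the same opcode as MI and a zero
    // immediate offset are considered as potential anchors.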
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile. We also won't be able to merge across this,
    // so break the search. We can look after this barrier for separate
    // merges.
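    // hasOrderedMemoryRef() is conservative: it is true for volatile and
    // atomic/ordered accesses, and also when the memory reference cannot be
    // analyzed.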
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with it. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a
      // merge, so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                            std::list<CombineInfo> &MergeList,
                                            bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect a list of all instructions we know how to merge
      // in a subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}