//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// It will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
// (read2/write2 offsets are encoded in units of the element size, so byte
// offsets 16 and 32 become offset0:4 and offset1:8 for the b32 form).
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from a nearby instruction that
// allows it to have a 13-bit constant offset, and then promotes that 13-bit
// offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair at a time, recomputes live intervals, and moves on to the next
//   pair. It would be better to compute a list of all merges that need to
//   occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offset fields, but the differences between them are small enough to fit,
//   we can add to the base pointer and use the new, reduced offsets, as in
//   the example below.
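//   For example (illustrative): two 4-byte loads at byte offsets 1024 and
//   1032 have element offsets 256 and 258, which do not fit in 8 bits, but
//   after adding 1024 to the base pointer the remaining element offsets are
//   0 and 2, which do fit.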
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
  SSAMP = 0x20,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    bool GLC;
    bool SLC;
    bool DLC;
    bool UseST64;
    int AddrIdx[5];
    const MachineOperand *AddrReg[5];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (Register::isPhysicalRegister(AddrOp->getReg()))
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
               const GCNSubtarget &STM);
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MCSubtargetInfo *STI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);

  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
                            SmallVectorImpl<MachineInstr *> &InstsToMove);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
                                             CombineInfo &Paired,
                                             const SmallVectorImpl<MachineInstr *> &InstsToMove);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  const SmallVectorImpl<MachineInstr *> &InstsToMove);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 const SmallVectorImpl<MachineInstr *> &InstsToMove);
  MachineBasicBlock::iterator
  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                          const SmallVectorImpl<MachineInstr *> &InstsToMove);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      const SmallVectorImpl<MachineInstr *> &InstsToMove);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       const SmallVectorImpl<MachineInstr *> &InstsToMove);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        const SmallVectorImpl<MachineInstr *> &InstsToMove);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset) const;
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;
  bool collectMergeableInsts(MachineBasicBlock &MBB,
                             std::list<std::list<CombineInfo> > &MergeableInsts) const;

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return countPopulation(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  }
}

static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  if (TII.isMIMG(Opc)) {
    unsigned result = VADDR | SRSRC;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      result |= SSAMP;

    return result;
  }
  if (TII.isMTBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMTBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMTBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMTBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SIInstrInfo &TII,
                                              const GCNSubtarget &STM) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, TII);

  if (InstClass == UNKNOWN)
    return;

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
    if (InstClass != S_BUFFER_LOAD_IMM) {
      SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
    }
    DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
  }

  unsigned AddrOpName[5] = {0};
  NumAddresses = 0;
  const unsigned Regs = getRegs(I->getOpcode(), TII);

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  if (Regs & SSAMP) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &I->getOperand(AddrIdx[i]);
  }
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
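// For example (illustrative), combining a 4-byte memory operand at offset 8
// with an adjacent 4-byte memory operand at offset 12 yields a single 8-byte
// memory operand at offset 8.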
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
                                AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
                                AMDGPU::OpName::da, AMDGPU::OpName::r128,
                                AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const MCSubtargetInfo &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const MCSubtargetInfo &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  unsigned EltOffset0 = CI.Offset / CI.EltSize;
  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
  }

  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset, Paired.Offset);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    if (Modify) {
      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    if (Modify) {
      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    return false;

  if (CI.InstClass != MIMG &&
      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STI, Paired)))
    return false;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  MachineBasicBlock::iterator E = std::next(Paired.I);
  MachineBasicBlock::iterator MBBI = std::next(CI.I);
  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (; MBBI != E; ++MBBI) {

    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order in a situation when it
      // moves an instruction which exists in some other merge list.
      // In this case it must be dependent.
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another load/store instruction, we will be moving
      // I down to the location of the matched instruction, so any uses of I
      // will need to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              InstsToMove))
      continue;

    if (&*MBBI == &*Paired.I) {
      // We need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {

        // Call offsetsCanBeCombined with modify = true so that the offsets are
        // correct for the new instruction. This should return true, because
        // this function should only be called on CombineInfo objects that
        // have already been confirmed to be mergeable.
        if (CI.InstClass != MIMG)
          offsetsCanBeCombined(CI, *STI, Paired, true);
        return true;
      }
      return false;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = (CI.EltSize == 4) ?
      &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired.I, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                                      const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, Paired.I, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  moveInstsAfter(Write2, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.GLC)       // glc
          .addImm(CI.DLC)       // dlc
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, Paired.I, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.GLC)       // glc
          .addImm(CI.SLC)       // slc
          .addImm(0)            // tfe
          .addImm(CI.DLC)       // dlc
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.GLC)       // glc
          .addImm(CI.SLC)       // slc
          .addImm(0)            // tfe
          .addImm(CI.DLC)       // dlc
          .addImm(0)            // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.GLC)                             // glc
          .addImm(CI.SLC)                             // slc
          .addImm(0)                                  // tfe
          .addImm(CI.DLC)                             // dlc
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  case MIMG:
    assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {

  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
    return std::make_pair(0, 0);

  bool ReverseOrder;
  if (CI.InstClass == MIMG) {
    assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
           "No overlaps");
    ReverseOrder = CI.DMask > Paired.DMask;
  } else
    ReverseOrder = CI.Offset > Paired.Offset;

  static const unsigned Idxs[4][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
      {AMDGPU::sub3, 0, 0, 0},
  };
  unsigned Idx0;
  unsigned Idx1;

  assert(CI.Width >= 1 && CI.Width <= 3);
  assert(Paired.Width >= 1 && Paired.Width <= 3);

  if (ReverseOrder) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::make_pair(Idx0, Idx1);
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    const SmallVectorImpl<MachineInstr *> &InstsToMove) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode, *TII);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));


  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.GLC)                             // glc
          .addImm(CI.SLC)                             // slc
          .addImm(0)                                  // tfe
          .addImm(CI.DLC)                             // dlc
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  moveInstsAfter(MIB, InstsToMove);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << " "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
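// The 64-bit add is expanded into a V_ADD_I32_e64 / V_ADDC_U32_e64 pair whose
// 32-bit results are combined into a 64-bit base register with REG_SEQUENCE.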
1639 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI, 1640 const MemAddress &Addr) const { 1641 MachineBasicBlock *MBB = MI.getParent(); 1642 MachineBasicBlock::iterator MBBI = MI.getIterator(); 1643 DebugLoc DL = MI.getDebugLoc(); 1644 1645 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 || 1646 Addr.Base.LoSubReg) && 1647 "Expected 32-bit Base-Register-Low!!"); 1648 1649 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 || 1650 Addr.Base.HiSubReg) && 1651 "Expected 32-bit Base-Register-Hi!!"); 1652 1653 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n"); 1654 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI); 1655 MachineOperand OffsetHi = 1656 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI); 1657 1658 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); 1659 Register CarryReg = MRI->createVirtualRegister(CarryRC); 1660 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC); 1661 1662 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1663 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1664 MachineInstr *LoHalf = 1665 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0) 1666 .addReg(CarryReg, RegState::Define) 1667 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg) 1668 .add(OffsetLo) 1669 .addImm(0); // clamp bit 1670 (void)LoHalf; 1671 LLVM_DEBUG(dbgs() << " "; LoHalf->dump();); 1672 1673 MachineInstr *HiHalf = 1674 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1) 1675 .addReg(DeadCarryReg, RegState::Define | RegState::Dead) 1676 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg) 1677 .add(OffsetHi) 1678 .addReg(CarryReg, RegState::Kill) 1679 .addImm(0); // clamp bit 1680 (void)HiHalf; 1681 LLVM_DEBUG(dbgs() << " "; HiHalf->dump();); 1682 1683 Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass); 1684 MachineInstr *FullBase = 1685 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg) 1686 .addReg(DestSub0) 1687 .addImm(AMDGPU::sub0) 1688 .addReg(DestSub1) 1689 .addImm(AMDGPU::sub1); 1690 (void)FullBase; 1691 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";); 1692 1693 return FullDestReg; 1694 } 1695 1696 // Update base and offset with the NewBase and NewOffset in MI. 
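//
// Only the vaddr and offset operands are touched; the kill flag on the base is
// cleared because the new base register may be shared by several rewritten
// instructions. Illustrative effect (operands abbreviated, not exact MIR):
//   GLOBAL_LOAD_DWORDX2 %dst, %oldbase, 0  ->  GLOBAL_LOAD_DWORDX2 %dst, %newbase, -4096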
1697 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1698                                                unsigned NewBase,
1699                                                int32_t NewOffset) const {
1700   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1701   Base->setReg(NewBase);
1702   Base->setIsKill(false);
1703   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1704 }
1705
1706 Optional<int32_t>
1707 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1708   if (Op.isImm())
1709     return Op.getImm();
1710
1711   if (!Op.isReg())
1712     return None;
1713
1714   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1715   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1716       !Def->getOperand(1).isImm())
1717     return None;
1718
1719   return Def->getOperand(1).getImm();
1720 }
1721
1722 // Analyzes Base and extracts:
1723 //  - the 32-bit base registers and subregisters
1724 //  - the 64-bit constant offset
1725 // Expecting the base computation as:
1726 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1727 //   %LO:vgpr_32, %c:sreg_64_xexec =
1728 //       V_ADD_I32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32
1729 //   %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1730 //   %Base:vreg_64 =
1731 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1732 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1733                                                       MemAddress &Addr) const {
1734   if (!Base.isReg())
1735     return;
1736
1737   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1738   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1739       || Def->getNumOperands() != 5)
1740     return;
1741
1742   MachineOperand BaseLo = Def->getOperand(1);
1743   MachineOperand BaseHi = Def->getOperand(3);
1744   if (!BaseLo.isReg() || !BaseHi.isReg())
1745     return;
1746
1747   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1748   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1749
1750   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1751       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1752     return;
1753
1754   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1755   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1756
1757   auto Offset0P = extractConstOffset(*Src0);
1758   if (Offset0P)
1759     BaseLo = *Src1;
1760   else {
1761     if (!(Offset0P = extractConstOffset(*Src1)))
1762       return;
1763     BaseLo = *Src0;
1764   }
1765
1766   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1767   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1768
1769   if (Src0->isImm())
1770     std::swap(Src0, Src1);
1771
1772   if (!Src1->isImm())
1773     return;
1774
1775   uint64_t Offset1 = Src1->getImm();
1776   BaseHi = *Src0;
1777
1778   Addr.Base.LoReg = BaseLo.getReg();
1779   Addr.Base.HiReg = BaseHi.getReg();
1780   Addr.Base.LoSubReg = BaseLo.getSubReg();
1781   Addr.Base.HiSubReg = BaseHi.getSubReg();
1782   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1783 }
1784
1785 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1786     MachineInstr &MI,
1787     MemInfoMap &Visited,
1788     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1789
1790   if (!(MI.mayLoad() ^ MI.mayStore()))
1791     return false;
1792
1793   // TODO: Support flat and scratch.
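  // getGlobalSaddrOp() returns a negative value when the opcode has no
  // GLOBAL_*_SADDR counterpart, i.e. when this is not a global memory access
  // that the promotion below knows how to rewrite.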
1794   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1795     return false;
1796
1797   if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1798     return false;
1799
1800   if (AnchorList.count(&MI))
1801     return false;
1802
1803   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1804
1805   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1806     LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1807     return false;
1808   }
1809
1810   // Step 1: Find the base registers and the 64-bit constant offset.
1811   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1812   MemAddress MAddr;
1813   if (Visited.find(&MI) == Visited.end()) {
1814     processBaseWithConstOffset(Base, MAddr);
1815     Visited[&MI] = MAddr;
1816   } else
1817     MAddr = Visited[&MI];
1818
1819   if (MAddr.Offset == 0) {
1820     LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1821                          " constant offsets that can be promoted.\n";);
1822     return false;
1823   }
1824
1825   LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1826              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1827
1828   // Step 2: Traverse MI's basic block and find an anchor (an instruction with
1829   // the same base registers) whose offset has the largest legal 13-bit
1830   // distance from MI's offset. E.g. (64-bit loads)
1831   // bb:
1832   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1833   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1834   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1835   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1836   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1837   //
1838   // Starting from the first load, the optimization tries to find a new base
1839   // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
1840   // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
1841   // &a + 8192 as the new base (anchor) because the larger distance can
1842   // presumably accommodate more intermediate addresses.
1843   //
1844   // Step 3: Materialize (&a + 8192) above load1, then compute and promote
1845   // offsets relative to it for load1, load2 and load4 (load3 is the anchor).
1846   //   addr = &a + 8192
1847   //   load1 = load(addr, -4096)
1848   //   load2 = load(addr, -2048)
1849   //   load3 = load(addr, 0)
1850   //   load4 = load(addr, 2048)
1851   //   addr5 = &a + 12288;  load5 = load(addr5, 0)
1852   //
1853   MachineInstr *AnchorInst = nullptr;
1854   MemAddress AnchorAddr;
1855   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1856   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1857
1858   MachineBasicBlock *MBB = MI.getParent();
1859   MachineBasicBlock::iterator E = MBB->end();
1860   MachineBasicBlock::iterator MBBI = MI.getIterator();
1861   ++MBBI;
1862   const SITargetLowering *TLI =
1863       static_cast<const SITargetLowering *>(STM->getTargetLowering());
1864
1865   for ( ; MBBI != E; ++MBBI) {
1866     MachineInstr &MINext = *MBBI;
1867     // TODO: Support finding an anchor (with the same base) among store
1868     // addresses or other load addresses where the opcodes differ.
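    // The scan only considers instructions with the same opcode and a zero
    // immediate offset. Among those sharing MI's base registers, it keeps the
    // candidate whose distance from MI's offset is the largest value still
    // accepted by isLegalGlobalAddressingMode(), on the assumption that a far
    // anchor leaves room for more addresses in between.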
1869     if (MINext.getOpcode() != MI.getOpcode() ||
1870         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1871       continue;
1872
1873     const MachineOperand &BaseNext =
1874         *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1875     MemAddress MAddrNext;
1876     if (Visited.find(&MINext) == Visited.end()) {
1877       processBaseWithConstOffset(BaseNext, MAddrNext);
1878       Visited[&MINext] = MAddrNext;
1879     } else
1880       MAddrNext = Visited[&MINext];
1881
1882     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1883         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1884         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1885         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1886       continue;
1887
1888     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1889
1890     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1891     TargetLoweringBase::AddrMode AM;
1892     AM.HasBaseReg = true;
1893     AM.BaseOffs = Dist;
1894     if (TLI->isLegalGlobalAddressingMode(AM) &&
1895         (uint32_t)std::abs(Dist) > MaxDist) {
1896       MaxDist = std::abs(Dist);
1897
1898       AnchorAddr = MAddrNext;
1899       AnchorInst = &MINext;
1900     }
1901   }
1902
1903   if (AnchorInst) {
1904     LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1905                AnchorInst->dump());
1906     LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1907                << AnchorAddr.Offset << "\n\n");
1908
1909     // Instead of moving the anchor up, just re-compute its base address at MI.
1910     unsigned Base = computeBase(MI, AnchorAddr);
1911
1912     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1913     LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1914
1915     for (auto P : InstsWCommonBase) {
1916       TargetLoweringBase::AddrMode AM;
1917       AM.HasBaseReg = true;
1918       AM.BaseOffs = P.second - AnchorAddr.Offset;
1919
1920       if (TLI->isLegalGlobalAddressingMode(AM)) {
1921         LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1922                    dbgs() << ")"; P.first->dump());
1923         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1924         LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1925       }
1926     }
1927     AnchorList.insert(AnchorInst);
1928     return true;
1929   }
1930
1931   return false;
1932 }
1933
1934 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1935     std::list<std::list<CombineInfo> > &MergeableInsts) const {
1936   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1937     if (AddrList.front().InstClass == CI.InstClass &&
1938         AddrList.front().hasSameBaseAddress(*CI.I)) {
1939       AddrList.emplace_back(CI);
1940       return;
1941     }
1942   }
1943
1944   // Base address not found, so add a new list.
1945   MergeableInsts.emplace_back(1, CI);
1946 }
1947
1948 bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
1949     std::list<std::list<CombineInfo> > &MergeableInsts) const {
1950   bool Modified = false;
1951   // Maps each visited instruction to its decomposed base address and offset.
1952   MemInfoMap Visited;
1953   // Contains the anchor instructions picked by promoteConstantOffsetToImm();
1954   // they are excluded from later promotion attempts.
1955   SmallPtrSet<MachineInstr *, 4> AnchorList;
1956
1957   // Sort potentially mergeable instructions into lists, one list per base address.
1958   unsigned Order = 0;
1959   for (MachineInstr &MI : MBB.instrs()) {
1960     // We run this before checking if an address is mergeable, because it can produce
1961     // better code even if the instructions aren't mergeable.
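    // The promotion (if any) happens first; the instruction is then classified,
    // and anything we do not know how to merge (unknown class, ordered or
    // volatile access, unmergeable address) is skipped. Otherwise a CombineInfo
    // describing it is appended to the list for its base address.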
1962     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1963       Modified = true;
1964
1965     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1966     if (InstClass == UNKNOWN)
1967       continue;
1968
1969     // Don't combine volatile or ordered (e.g. atomic) accesses.
1970     if (MI.hasOrderedMemoryRef())
1971       continue;
1972
1973     CombineInfo CI;
1974     CI.setMI(MI, *TII, *STM);
1975     CI.Order = Order++;
1976
1977     if (!CI.hasMergeableAddress(*MRI))
1978       continue;
1979
1980     addInstToMergeableList(CI, MergeableInsts);
1981   }
1982
1983   // At this point we have lists of mergeable instructions, one list per base
1984   // address.
1985   //
1986   // Part 2: Discard any list with fewer than two entries (a merge needs at
1987   // least two instructions) and sort the rest by offset so candidate pairs end
1988   // up adjacent; the pairing itself happens later, in optimizeInstsWithSameBaseAddr().
1989
1990   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
1991        E = MergeableInsts.end(); I != E;) {
1992
1993     std::list<CombineInfo> &MergeList = *I;
1994     if (MergeList.size() <= 1) {
1995       // This means we have found only one instruction with a given address
1996       // that can be merged, and we need at least 2 instructions to do a merge,
1997       // so this list can be discarded.
1998       I = MergeableInsts.erase(I);
1999       continue;
2000     }
2001
2002     // Sort the list by offset so that mergeable instructions end up adjacent
2003     // to each other in the list; this makes it easier to find matching pairs
2004     // later on.
2005     MergeList.sort(
2006         [] (const CombineInfo &A, const CombineInfo &B) {
2007           return A.Offset < B.Offset;
2008         });
2009     ++I;
2010   }
2011
2012   return Modified;
2013 }
2014
2015 // Scan through looking for adjacent memory operations with constant offsets
2016 // from the same base register. We rely on the scheduler to do the hard work
2017 // of clustering nearby loads, and assume these are all adjacent.
2018 bool SILoadStoreOptimizer::optimizeBlock(
2019        std::list<std::list<CombineInfo> > &MergeableInsts) {
2020   bool Modified = false;
2021
2022   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2023        E = MergeableInsts.end(); I != E;) {
2024     std::list<CombineInfo> &MergeList = *I;
2025
2026     bool OptimizeListAgain = false;
2027     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2028       // We weren't able to make any changes, so delete the list so we don't
2029       // process the same instructions the next time we try to optimize this
2030       // block.
2031 I = MergeableInsts.erase(I); 2032 continue; 2033 } 2034 2035 Modified = true; 2036 2037 // We made changes, but also determined that there were no more optimization 2038 // opportunities, so we don't need to reprocess the list 2039 if (!OptimizeListAgain) { 2040 I = MergeableInsts.erase(I); 2041 continue; 2042 } 2043 OptimizeAgain = true; 2044 } 2045 return Modified; 2046 } 2047 2048 bool 2049 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( 2050 std::list<CombineInfo> &MergeList, 2051 bool &OptimizeListAgain) { 2052 if (MergeList.empty()) 2053 return false; 2054 2055 bool Modified = false; 2056 2057 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end(); 2058 Next = std::next(I)) { 2059 2060 auto First = I; 2061 auto Second = Next; 2062 2063 if ((*First).Order > (*Second).Order) 2064 std::swap(First, Second); 2065 CombineInfo &CI = *First; 2066 CombineInfo &Paired = *Second; 2067 2068 SmallVector<MachineInstr *, 8> InstsToMove; 2069 if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) { 2070 ++I; 2071 continue; 2072 } 2073 2074 Modified = true; 2075 2076 switch (CI.InstClass) { 2077 default: 2078 llvm_unreachable("unknown InstClass"); 2079 break; 2080 case DS_READ: { 2081 MachineBasicBlock::iterator NewMI = 2082 mergeRead2Pair(CI, Paired, InstsToMove); 2083 CI.setMI(NewMI, *TII, *STM); 2084 break; 2085 } 2086 case DS_WRITE: { 2087 MachineBasicBlock::iterator NewMI = 2088 mergeWrite2Pair(CI, Paired, InstsToMove); 2089 CI.setMI(NewMI, *TII, *STM); 2090 break; 2091 } 2092 case S_BUFFER_LOAD_IMM: { 2093 MachineBasicBlock::iterator NewMI = 2094 mergeSBufferLoadImmPair(CI, Paired, InstsToMove); 2095 CI.setMI(NewMI, *TII, *STM); 2096 OptimizeListAgain |= (CI.Width + Paired.Width) < 16; 2097 break; 2098 } 2099 case BUFFER_LOAD: { 2100 MachineBasicBlock::iterator NewMI = 2101 mergeBufferLoadPair(CI, Paired, InstsToMove); 2102 CI.setMI(NewMI, *TII, *STM); 2103 OptimizeListAgain |= (CI.Width + Paired.Width) < 4; 2104 break; 2105 } 2106 case BUFFER_STORE: { 2107 MachineBasicBlock::iterator NewMI = 2108 mergeBufferStorePair(CI, Paired, InstsToMove); 2109 CI.setMI(NewMI, *TII, *STM); 2110 OptimizeListAgain |= (CI.Width + Paired.Width) < 4; 2111 break; 2112 } 2113 case MIMG: { 2114 MachineBasicBlock::iterator NewMI = 2115 mergeImagePair(CI, Paired, InstsToMove); 2116 CI.setMI(NewMI, *TII, *STM); 2117 OptimizeListAgain |= (CI.Width + Paired.Width) < 4; 2118 break; 2119 } 2120 case TBUFFER_LOAD: { 2121 MachineBasicBlock::iterator NewMI = 2122 mergeTBufferLoadPair(CI, Paired, InstsToMove); 2123 CI.setMI(NewMI, *TII, *STM); 2124 OptimizeListAgain |= (CI.Width + Paired.Width) < 4; 2125 break; 2126 } 2127 case TBUFFER_STORE: { 2128 MachineBasicBlock::iterator NewMI = 2129 mergeTBufferStorePair(CI, Paired, InstsToMove); 2130 CI.setMI(NewMI, *TII, *STM); 2131 OptimizeListAgain |= (CI.Width + Paired.Width) < 4; 2132 break; 2133 } 2134 } 2135 CI.Order = Paired.Order; 2136 if (I == Second) 2137 I = Next; 2138 2139 MergeList.erase(Second); 2140 } 2141 2142 return Modified; 2143 } 2144 2145 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { 2146 if (skipFunction(MF.getFunction())) 2147 return false; 2148 2149 STM = &MF.getSubtarget<GCNSubtarget>(); 2150 if (!STM->loadStoreOptEnabled()) 2151 return false; 2152 2153 TII = STM->getInstrInfo(); 2154 TRI = &TII->getRegisterInfo(); 2155 STI = &MF.getSubtarget<MCSubtargetInfo>(); 2156 2157 MRI = &MF.getRegInfo(); 2158 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); 2159 2160 assert(MRI->isSSA() && "Must be run on 
SSA"); 2161 2162 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n"); 2163 2164 bool Modified = false; 2165 2166 2167 for (MachineBasicBlock &MBB : MF) { 2168 std::list<std::list<CombineInfo> > MergeableInsts; 2169 // First pass: Collect list of all instructions we know how to merge. 2170 Modified |= collectMergeableInsts(MBB, MergeableInsts); 2171 do { 2172 OptimizeAgain = false; 2173 Modified |= optimizeBlock(MergeableInsts); 2174 } while (OptimizeAgain); 2175 } 2176 2177 return Modified; 2178 } 2179