//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from nearby instructions that
// allows the access to have a 13-bit constant offset, and then promotes that
// 13-bit offset to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, then recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together to fit once rebased, we can
//   add to the base pointer and use the new, reduced offsets.
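//   For example (illustrative): element offsets 300 and 302 both overflow the
//   8-bit field, but after adding 300 * EltSize to the base pointer the
//   remaining offsets are 0 and 2, which do fit.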
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
  SSAMP = 0x20,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    bool GLC;
    bool SLC;
    bool DLC;
    bool UseST64;
    int AddrIdx[5];
    const MachineOperand *AddrReg[5];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (Register::isPhysicalRegister(AddrOp->getReg()))
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
175 if (MRI.hasOneNonDBGUse(AddrOp->getReg())) 176 return false; 177 } 178 return true; 179 } 180 181 void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, 182 const GCNSubtarget &STM); 183 }; 184 185 struct BaseRegisters { 186 Register LoReg; 187 Register HiReg; 188 189 unsigned LoSubReg = 0; 190 unsigned HiSubReg = 0; 191 }; 192 193 struct MemAddress { 194 BaseRegisters Base; 195 int64_t Offset = 0; 196 }; 197 198 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>; 199 200 private: 201 const GCNSubtarget *STM = nullptr; 202 const SIInstrInfo *TII = nullptr; 203 const SIRegisterInfo *TRI = nullptr; 204 MachineRegisterInfo *MRI = nullptr; 205 AliasAnalysis *AA = nullptr; 206 bool OptimizeAgain; 207 208 static bool dmasksCanBeCombined(const CombineInfo &CI, 209 const SIInstrInfo &TII, 210 const CombineInfo &Paired); 211 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI, 212 CombineInfo &Paired, bool Modify = false); 213 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, 214 const CombineInfo &Paired); 215 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); 216 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI, 217 const CombineInfo &Paired); 218 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI, 219 const CombineInfo &Paired); 220 221 bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired, 222 SmallVectorImpl<MachineInstr *> &InstsToMove); 223 224 unsigned read2Opcode(unsigned EltSize) const; 225 unsigned read2ST64Opcode(unsigned EltSize) const; 226 MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, 227 CombineInfo &Paired, 228 const SmallVectorImpl<MachineInstr *> &InstsToMove); 229 230 unsigned write2Opcode(unsigned EltSize) const; 231 unsigned write2ST64Opcode(unsigned EltSize) const; 232 MachineBasicBlock::iterator 233 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 234 const SmallVectorImpl<MachineInstr *> &InstsToMove); 235 MachineBasicBlock::iterator 236 mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 237 const SmallVectorImpl<MachineInstr *> &InstsToMove); 238 MachineBasicBlock::iterator 239 mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired, 240 const SmallVectorImpl<MachineInstr *> &InstsToMove); 241 MachineBasicBlock::iterator 242 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 243 const SmallVectorImpl<MachineInstr *> &InstsToMove); 244 MachineBasicBlock::iterator 245 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 246 const SmallVectorImpl<MachineInstr *> &InstsToMove); 247 MachineBasicBlock::iterator 248 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired, 249 const SmallVectorImpl<MachineInstr *> &InstsToMove); 250 MachineBasicBlock::iterator 251 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired, 252 const SmallVectorImpl<MachineInstr *> &InstsToMove); 253 254 void updateBaseAndOffset(MachineInstr &I, Register NewBase, 255 int32_t NewOffset) const; 256 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const; 257 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const; 258 Optional<int32_t> extractConstOffset(const MachineOperand &Op) const; 259 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const; 260 /// Promotes constant offset to the immediate by adjusting the base. 
It 261 /// tries to use a base from the nearby instructions that allows it to have 262 /// a 13bit constant offset which gets promoted to the immediate. 263 bool promoteConstantOffsetToImm(MachineInstr &CI, 264 MemInfoMap &Visited, 265 SmallPtrSet<MachineInstr *, 4> &Promoted) const; 266 void addInstToMergeableList(const CombineInfo &CI, 267 std::list<std::list<CombineInfo> > &MergeableInsts) const; 268 269 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts( 270 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, 271 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList, 272 std::list<std::list<CombineInfo>> &MergeableInsts) const; 273 274 public: 275 static char ID; 276 277 SILoadStoreOptimizer() : MachineFunctionPass(ID) { 278 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); 279 } 280 281 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList, 282 bool &OptimizeListAgain); 283 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts); 284 285 bool runOnMachineFunction(MachineFunction &MF) override; 286 287 StringRef getPassName() const override { return "SI Load Store Optimizer"; } 288 289 void getAnalysisUsage(AnalysisUsage &AU) const override { 290 AU.setPreservesCFG(); 291 AU.addRequired<AAResultsWrapperPass>(); 292 293 MachineFunctionPass::getAnalysisUsage(AU); 294 } 295 296 MachineFunctionProperties getRequiredProperties() const override { 297 return MachineFunctionProperties() 298 .set(MachineFunctionProperties::Property::IsSSA); 299 } 300 }; 301 302 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { 303 const unsigned Opc = MI.getOpcode(); 304 305 if (TII.isMUBUF(Opc)) { 306 // FIXME: Handle d16 correctly 307 return AMDGPU::getMUBUFElements(Opc); 308 } 309 if (TII.isMIMG(MI)) { 310 uint64_t DMaskImm = 311 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm(); 312 return countPopulation(DMaskImm); 313 } 314 if (TII.isMTBUF(Opc)) { 315 return AMDGPU::getMTBUFElements(Opc); 316 } 317 318 switch (Opc) { 319 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 320 return 1; 321 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 322 return 2; 323 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 324 return 4; 325 default: 326 return 0; 327 } 328 } 329 330 /// Maps instruction opcode to enum InstClassEnum. 331 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { 332 switch (Opc) { 333 default: 334 if (TII.isMUBUF(Opc)) { 335 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) { 336 default: 337 return UNKNOWN; 338 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN: 339 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact: 340 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET: 341 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact: 342 return BUFFER_LOAD; 343 case AMDGPU::BUFFER_STORE_DWORD_OFFEN: 344 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact: 345 case AMDGPU::BUFFER_STORE_DWORD_OFFSET: 346 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact: 347 return BUFFER_STORE; 348 } 349 } 350 if (TII.isMIMG(Opc)) { 351 // Ignore instructions encoded without vaddr. 352 if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1) 353 return UNKNOWN; 354 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD. 
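      // Only plain MIMG loads are merged here: the check below rejects
      // stores, instructions that are not loads, and gather4 variants
      // (whose dmask selects a single channel rather than a set of
      // result components).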
355 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || 356 TII.isGather4(Opc)) 357 return UNKNOWN; 358 return MIMG; 359 } 360 if (TII.isMTBUF(Opc)) { 361 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) { 362 default: 363 return UNKNOWN; 364 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN: 365 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact: 366 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET: 367 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact: 368 return TBUFFER_LOAD; 369 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN: 370 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact: 371 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET: 372 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact: 373 return TBUFFER_STORE; 374 } 375 } 376 return UNKNOWN; 377 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 378 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 379 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 380 return S_BUFFER_LOAD_IMM; 381 case AMDGPU::DS_READ_B32: 382 case AMDGPU::DS_READ_B32_gfx9: 383 case AMDGPU::DS_READ_B64: 384 case AMDGPU::DS_READ_B64_gfx9: 385 return DS_READ; 386 case AMDGPU::DS_WRITE_B32: 387 case AMDGPU::DS_WRITE_B32_gfx9: 388 case AMDGPU::DS_WRITE_B64: 389 case AMDGPU::DS_WRITE_B64_gfx9: 390 return DS_WRITE; 391 } 392 } 393 394 /// Determines instruction subclass from opcode. Only instructions 395 /// of the same subclass can be merged together. 396 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { 397 switch (Opc) { 398 default: 399 if (TII.isMUBUF(Opc)) 400 return AMDGPU::getMUBUFBaseOpcode(Opc); 401 if (TII.isMIMG(Opc)) { 402 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 403 assert(Info); 404 return Info->BaseOpcode; 405 } 406 if (TII.isMTBUF(Opc)) 407 return AMDGPU::getMTBUFBaseOpcode(Opc); 408 return -1; 409 case AMDGPU::DS_READ_B32: 410 case AMDGPU::DS_READ_B32_gfx9: 411 case AMDGPU::DS_READ_B64: 412 case AMDGPU::DS_READ_B64_gfx9: 413 case AMDGPU::DS_WRITE_B32: 414 case AMDGPU::DS_WRITE_B32_gfx9: 415 case AMDGPU::DS_WRITE_B64: 416 case AMDGPU::DS_WRITE_B64_gfx9: 417 return Opc; 418 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 419 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 420 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 421 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM; 422 } 423 } 424 425 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) { 426 if (TII.isMUBUF(Opc)) { 427 unsigned result = 0; 428 429 if (AMDGPU::getMUBUFHasVAddr(Opc)) { 430 result |= VADDR; 431 } 432 433 if (AMDGPU::getMUBUFHasSrsrc(Opc)) { 434 result |= SRSRC; 435 } 436 437 if (AMDGPU::getMUBUFHasSoffset(Opc)) { 438 result |= SOFFSET; 439 } 440 441 return result; 442 } 443 444 if (TII.isMIMG(Opc)) { 445 unsigned result = VADDR | SRSRC; 446 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc); 447 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler) 448 result |= SSAMP; 449 450 return result; 451 } 452 if (TII.isMTBUF(Opc)) { 453 unsigned result = 0; 454 455 if (AMDGPU::getMTBUFHasVAddr(Opc)) { 456 result |= VADDR; 457 } 458 459 if (AMDGPU::getMTBUFHasSrsrc(Opc)) { 460 result |= SRSRC; 461 } 462 463 if (AMDGPU::getMTBUFHasSoffset(Opc)) { 464 result |= SOFFSET; 465 } 466 467 return result; 468 } 469 470 switch (Opc) { 471 default: 472 return 0; 473 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: 474 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: 475 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: 476 return SBASE; 477 case AMDGPU::DS_READ_B32: 478 case AMDGPU::DS_READ_B64: 479 case AMDGPU::DS_READ_B32_gfx9: 480 case AMDGPU::DS_READ_B64_gfx9: 481 case AMDGPU::DS_WRITE_B32: 482 case AMDGPU::DS_WRITE_B64: 483 case AMDGPU::DS_WRITE_B32_gfx9: 484 
case AMDGPU::DS_WRITE_B64_gfx9: 485 return ADDR; 486 } 487 } 488 489 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, 490 const SIInstrInfo &TII, 491 const GCNSubtarget &STM) { 492 I = MI; 493 unsigned Opc = MI->getOpcode(); 494 InstClass = getInstClass(Opc, TII); 495 496 if (InstClass == UNKNOWN) 497 return; 498 499 switch (InstClass) { 500 case DS_READ: 501 EltSize = 502 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 503 : 4; 504 break; 505 case DS_WRITE: 506 EltSize = 507 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 508 : 4; 509 break; 510 case S_BUFFER_LOAD_IMM: 511 EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4); 512 break; 513 default: 514 EltSize = 4; 515 break; 516 } 517 518 if (InstClass == MIMG) { 519 DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm(); 520 // Offset is not considered for MIMG instructions. 521 Offset = 0; 522 } else { 523 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset); 524 Offset = I->getOperand(OffsetIdx).getImm(); 525 } 526 527 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) 528 Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm(); 529 530 Width = getOpcodeWidth(*I, TII); 531 532 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) { 533 Offset &= 0xffff; 534 } else if (InstClass != MIMG) { 535 GLC = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm(); 536 if (InstClass != S_BUFFER_LOAD_IMM) { 537 SLC = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm(); 538 } 539 DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm(); 540 } 541 542 unsigned AddrOpName[5] = {0}; 543 NumAddresses = 0; 544 const unsigned Regs = getRegs(I->getOpcode(), TII); 545 546 if (Regs & ADDR) { 547 AddrOpName[NumAddresses++] = AMDGPU::OpName::addr; 548 } 549 550 if (Regs & SBASE) { 551 AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase; 552 } 553 554 if (Regs & SRSRC) { 555 AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc; 556 } 557 558 if (Regs & SOFFSET) { 559 AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset; 560 } 561 562 if (Regs & VADDR) { 563 AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr; 564 } 565 566 if (Regs & SSAMP) { 567 AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp; 568 } 569 570 for (unsigned i = 0; i < NumAddresses; i++) { 571 AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); 572 AddrReg[i] = &I->getOperand(AddrIdx[i]); 573 } 574 } 575 576 } // end anonymous namespace. 
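
// Pass registration and creation. The pass is exposed both through
// SILoadStoreOptimizerID (for pipelines that add passes by ID) and through
// createSILoadStoreOptimizerPass(); the AMDGPU target is assumed to schedule
// it while the machine IR is still in SSA form (see getRequiredProperties()
// above).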

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
                                  DenseSet<Register> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
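// For example (illustrative): two 4-byte memoperands at offsets 16 and 20
// combine into one 8-byte memoperand at offset 16; the size is the sum of the
// two sizes and the offset is the smaller of the two offsets.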
668 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF, 669 const MachineMemOperand *A, 670 const MachineMemOperand *B) { 671 unsigned MinOffset = std::min(A->getOffset(), B->getOffset()); 672 unsigned Size = A->getSize() + B->getSize(); 673 // This function adds the offset parameter to the existing offset for A, 674 // so we pass 0 here as the offset and then manually set it to the correct 675 // value after the call. 676 MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size); 677 MMO->setOffset(MinOffset); 678 return MMO; 679 } 680 681 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, 682 const SIInstrInfo &TII, 683 const CombineInfo &Paired) { 684 assert(CI.InstClass == MIMG); 685 686 // Ignore instructions with tfe/lwe set. 687 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe); 688 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe); 689 690 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm())) 691 return false; 692 693 // Check other optional immediate operands for equality. 694 unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc, 695 AMDGPU::OpName::d16, AMDGPU::OpName::unorm, 696 AMDGPU::OpName::da, AMDGPU::OpName::r128, 697 AMDGPU::OpName::a16}; 698 699 for (auto op : OperandsToMatch) { 700 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op); 701 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx) 702 return false; 703 if (Idx != -1 && 704 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm()) 705 return false; 706 } 707 708 // Check DMask for overlaps. 709 unsigned MaxMask = std::max(CI.DMask, Paired.DMask); 710 unsigned MinMask = std::min(CI.DMask, Paired.DMask); 711 712 unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask); 713 if ((1u << AllowedBitsForMin) <= MinMask) 714 return false; 715 716 return true; 717 } 718 719 static unsigned getBufferFormatWithCompCount(unsigned OldFormat, 720 unsigned ComponentCount, 721 const GCNSubtarget &STI) { 722 if (ComponentCount > 4) 723 return 0; 724 725 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo = 726 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI); 727 if (!OldFormatInfo) 728 return 0; 729 730 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo = 731 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp, 732 ComponentCount, 733 OldFormatInfo->NumFormat, STI); 734 735 if (!NewFormatInfo) 736 return 0; 737 738 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat && 739 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp); 740 741 return NewFormatInfo->Format; 742 } 743 744 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI, 745 const GCNSubtarget &STI, 746 CombineInfo &Paired, 747 bool Modify) { 748 assert(CI.InstClass != MIMG); 749 750 // XXX - Would the same offset be OK? Is there any reason this would happen or 751 // be useful? 752 if (CI.Offset == Paired.Offset) 753 return false; 754 755 // This won't be valid if the offset isn't aligned. 
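  // Worked example for the DS path below (illustrative): with EltSize == 4,
  // byte offsets 0x4000 and 0x4100 give element offsets 4096 and 4160; both
  // are multiples of 64, and 4096/64 == 64 and 4160/64 == 65 fit in 8 bits,
  // so the ST64 form is selected with offsets 64 and 65.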
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  unsigned EltOffset0 = CI.Offset / CI.EltSize;
  unsigned EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.GLC == Paired.GLC && CI.DLC == Paired.DLC &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
  }

  // Handle DS instructions.
  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset, Paired.Offset);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    if (Modify) {
      CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
      Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    if (Modify) {
      CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
      Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

/// This function assumes that CI comes before Paired in a basic block.
bool SILoadStoreOptimizer::checkAndPrepareMerge(
    CombineInfo &CI, CombineInfo &Paired,
    SmallVectorImpl<MachineInstr *> &InstsToMove) {

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
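  // For MIMG, "combined" means the two dmasks do not overlap and the smaller
  // mask fits entirely below the lowest set bit of the larger one; e.g.
  // (illustrative) dmasks 0x3 and 0xc merge to 0xf, while 0x3 and 0x6 are
  // rejected because bit 1 is set in both.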
  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
    return false;

  if (CI.InstClass != MIMG &&
      (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
    return false;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc, *TII);

  if (InstClass == UNKNOWN) {
    return false;
  }
  const unsigned InstSubclass = getInstSubclass(Opc, *TII);

  // Do not merge VMEM buffer instructions with "swizzled" bit set.
  int Swizzled =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
    return false;

  DenseSet<Register> RegDefsToMove;
  DenseSet<Register> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  MachineBasicBlock::iterator E = std::next(Paired.I);
  MachineBasicBlock::iterator MBBI = std::next(CI.I);
  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
  for (; MBBI != E; ++MBBI) {

    if (MBBI == MBBE) {
      // CombineInfo::Order is a hint on the instruction ordering within the
      // basic block. This hint suggests that CI precedes Paired, which is
      // true most of the time. However, moveInstsAfter() processing a
      // previous list may have changed this order in a situation when it
      // moves an instruction which exists in some other merge list.
      // In this case it must be dependent.
      return false;
    }

    if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
        (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    int Swizzled =
        AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
955 if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove, 956 InstsToMove)) 957 continue; 958 959 if (&*MBBI == &*Paired.I) { 960 // We need to go through the list of instructions that we plan to 961 // move and make sure they are all safe to move down past the merged 962 // instruction. 963 if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) { 964 965 // Call offsetsCanBeCombined with modify = true so that the offsets are 966 // correct for the new instruction. This should return true, because 967 // this function should only be called on CombineInfo objects that 968 // have already been confirmed to be mergeable. 969 if (CI.InstClass != MIMG) 970 offsetsCanBeCombined(CI, *STM, Paired, true); 971 return true; 972 } 973 return false; 974 } 975 976 // We've found a load/store that we couldn't merge for some reason. 977 // We could potentially keep looking, but we'd need to make sure that 978 // it was safe to move I and also all the instruction in InstsToMove 979 // down past this instruction. 980 // check if we can move I across MBBI and if we can move all I's users 981 if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) || 982 !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) 983 break; 984 } 985 return false; 986 } 987 988 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const { 989 if (STM->ldsRequiresM0Init()) 990 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64; 991 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9; 992 } 993 994 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const { 995 if (STM->ldsRequiresM0Init()) 996 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64; 997 998 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9 999 : AMDGPU::DS_READ2ST64_B64_gfx9; 1000 } 1001 1002 MachineBasicBlock::iterator 1003 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired, 1004 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1005 MachineBasicBlock *MBB = CI.I->getParent(); 1006 1007 // Be careful, since the addresses could be subregisters themselves in weird 1008 // cases, like vectors of pointers. 1009 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1010 1011 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst); 1012 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst); 1013 1014 unsigned NewOffset0 = CI.Offset; 1015 unsigned NewOffset1 = Paired.Offset; 1016 unsigned Opc = 1017 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize); 1018 1019 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1; 1020 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3; 1021 1022 if (NewOffset0 > NewOffset1) { 1023 // Canonicalize the merged instruction so the smaller offset comes first. 1024 std::swap(NewOffset0, NewOffset1); 1025 std::swap(SubRegIdx0, SubRegIdx1); 1026 } 1027 1028 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1029 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1030 1031 const MCInstrDesc &Read2Desc = TII->get(Opc); 1032 1033 const TargetRegisterClass *SuperRC = 1034 (CI.EltSize == 4) ? 
&AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass; 1035 Register DestReg = MRI->createVirtualRegister(SuperRC); 1036 1037 DebugLoc DL = CI.I->getDebugLoc(); 1038 1039 Register BaseReg = AddrReg->getReg(); 1040 unsigned BaseSubReg = AddrReg->getSubReg(); 1041 unsigned BaseRegFlags = 0; 1042 if (CI.BaseOff) { 1043 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1044 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1045 .addImm(CI.BaseOff); 1046 1047 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1048 BaseRegFlags = RegState::Kill; 1049 1050 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1051 .addReg(ImmReg) 1052 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1053 .addImm(0); // clamp bit 1054 BaseSubReg = 0; 1055 } 1056 1057 MachineInstrBuilder Read2 = 1058 BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg) 1059 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1060 .addImm(NewOffset0) // offset0 1061 .addImm(NewOffset1) // offset1 1062 .addImm(0) // gds 1063 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1064 1065 (void)Read2; 1066 1067 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1068 1069 // Copy to the old destination registers. 1070 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1071 .add(*Dest0) // Copy to same destination including flags and sub reg. 1072 .addReg(DestReg, 0, SubRegIdx0); 1073 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1074 .add(*Dest1) 1075 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1076 1077 moveInstsAfter(Copy1, InstsToMove); 1078 1079 CI.I->eraseFromParent(); 1080 Paired.I->eraseFromParent(); 1081 1082 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); 1083 return Read2; 1084 } 1085 1086 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { 1087 if (STM->ldsRequiresM0Init()) 1088 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64; 1089 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 1090 : AMDGPU::DS_WRITE2_B64_gfx9; 1091 } 1092 1093 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const { 1094 if (STM->ldsRequiresM0Init()) 1095 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 1096 : AMDGPU::DS_WRITE2ST64_B64; 1097 1098 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9 1099 : AMDGPU::DS_WRITE2ST64_B64_gfx9; 1100 } 1101 1102 MachineBasicBlock::iterator 1103 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired, 1104 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1105 MachineBasicBlock *MBB = CI.I->getParent(); 1106 1107 // Be sure to use .addOperand(), and not .addReg() with these. We want to be 1108 // sure we preserve the subregister index and any register flags set on them. 1109 const MachineOperand *AddrReg = 1110 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr); 1111 const MachineOperand *Data0 = 1112 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0); 1113 const MachineOperand *Data1 = 1114 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0); 1115 1116 unsigned NewOffset0 = CI.Offset; 1117 unsigned NewOffset1 = Paired.Offset; 1118 unsigned Opc = 1119 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize); 1120 1121 if (NewOffset0 > NewOffset1) { 1122 // Canonicalize the merged instruction so the smaller offset comes first. 
1123 std::swap(NewOffset0, NewOffset1); 1124 std::swap(Data0, Data1); 1125 } 1126 1127 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) && 1128 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit"); 1129 1130 const MCInstrDesc &Write2Desc = TII->get(Opc); 1131 DebugLoc DL = CI.I->getDebugLoc(); 1132 1133 Register BaseReg = AddrReg->getReg(); 1134 unsigned BaseSubReg = AddrReg->getSubReg(); 1135 unsigned BaseRegFlags = 0; 1136 if (CI.BaseOff) { 1137 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1138 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg) 1139 .addImm(CI.BaseOff); 1140 1141 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); 1142 BaseRegFlags = RegState::Kill; 1143 1144 TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg) 1145 .addReg(ImmReg) 1146 .addReg(AddrReg->getReg(), 0, BaseSubReg) 1147 .addImm(0); // clamp bit 1148 BaseSubReg = 0; 1149 } 1150 1151 MachineInstrBuilder Write2 = 1152 BuildMI(*MBB, Paired.I, DL, Write2Desc) 1153 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr 1154 .add(*Data0) // data0 1155 .add(*Data1) // data1 1156 .addImm(NewOffset0) // offset0 1157 .addImm(NewOffset1) // offset1 1158 .addImm(0) // gds 1159 .cloneMergedMemRefs({&*CI.I, &*Paired.I}); 1160 1161 moveInstsAfter(Write2, InstsToMove); 1162 1163 CI.I->eraseFromParent(); 1164 Paired.I->eraseFromParent(); 1165 1166 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); 1167 return Write2; 1168 } 1169 1170 MachineBasicBlock::iterator 1171 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired, 1172 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1173 MachineBasicBlock *MBB = CI.I->getParent(); 1174 DebugLoc DL = CI.I->getDebugLoc(); 1175 const unsigned Opcode = getNewOpcode(CI, Paired); 1176 1177 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1178 1179 Register DestReg = MRI->createVirtualRegister(SuperRC); 1180 unsigned MergedDMask = CI.DMask | Paired.DMask; 1181 unsigned DMaskIdx = 1182 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask); 1183 1184 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1185 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) { 1186 if (I == DMaskIdx) 1187 MIB.addImm(MergedDMask); 1188 else 1189 MIB.add((*CI.I).getOperand(I)); 1190 } 1191 1192 // It shouldn't be possible to get this far if the two instructions 1193 // don't have a single memoperand, because MachineInstr::mayAlias() 1194 // will return true if this is the case. 1195 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1196 1197 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1198 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1199 1200 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1201 1202 unsigned SubRegIdx0, SubRegIdx1; 1203 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired); 1204 1205 // Copy to the old destination registers. 1206 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1207 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1208 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1209 1210 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1211 .add(*Dest0) // Copy to same destination including flags and sub reg. 
1212 .addReg(DestReg, 0, SubRegIdx0); 1213 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1214 .add(*Dest1) 1215 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1216 1217 moveInstsAfter(Copy1, InstsToMove); 1218 1219 CI.I->eraseFromParent(); 1220 Paired.I->eraseFromParent(); 1221 return New; 1222 } 1223 1224 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair( 1225 CombineInfo &CI, CombineInfo &Paired, 1226 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1227 MachineBasicBlock *MBB = CI.I->getParent(); 1228 DebugLoc DL = CI.I->getDebugLoc(); 1229 const unsigned Opcode = getNewOpcode(CI, Paired); 1230 1231 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1232 1233 Register DestReg = MRI->createVirtualRegister(SuperRC); 1234 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1235 1236 // It shouldn't be possible to get this far if the two instructions 1237 // don't have a single memoperand, because MachineInstr::mayAlias() 1238 // will return true if this is the case. 1239 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1240 1241 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1242 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1243 1244 MachineInstr *New = 1245 BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg) 1246 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) 1247 .addImm(MergedOffset) // offset 1248 .addImm(CI.GLC) // glc 1249 .addImm(CI.DLC) // dlc 1250 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1251 1252 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1253 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1254 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1255 1256 // Copy to the old destination registers. 1257 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1258 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst); 1259 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst); 1260 1261 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1262 .add(*Dest0) // Copy to same destination including flags and sub reg. 1263 .addReg(DestReg, 0, SubRegIdx0); 1264 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1265 .add(*Dest1) 1266 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1267 1268 moveInstsAfter(Copy1, InstsToMove); 1269 1270 CI.I->eraseFromParent(); 1271 Paired.I->eraseFromParent(); 1272 return New; 1273 } 1274 1275 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair( 1276 CombineInfo &CI, CombineInfo &Paired, 1277 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1278 MachineBasicBlock *MBB = CI.I->getParent(); 1279 DebugLoc DL = CI.I->getDebugLoc(); 1280 1281 const unsigned Opcode = getNewOpcode(CI, Paired); 1282 1283 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1284 1285 // Copy to the new source register. 1286 Register DestReg = MRI->createVirtualRegister(SuperRC); 1287 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1288 1289 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1290 1291 const unsigned Regs = getRegs(Opcode, *TII); 1292 1293 if (Regs & VADDR) 1294 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1295 1296 // It shouldn't be possible to get this far if the two instructions 1297 // don't have a single memoperand, because MachineInstr::mayAlias() 1298 // will return true if this is the case. 
1299 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1300 1301 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1302 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1303 1304 MachineInstr *New = 1305 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1306 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1307 .addImm(MergedOffset) // offset 1308 .addImm(CI.GLC) // glc 1309 .addImm(CI.SLC) // slc 1310 .addImm(0) // tfe 1311 .addImm(CI.DLC) // dlc 1312 .addImm(0) // swz 1313 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1314 1315 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1316 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1317 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1318 1319 // Copy to the old destination registers. 1320 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1321 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1322 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1323 1324 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1325 .add(*Dest0) // Copy to same destination including flags and sub reg. 1326 .addReg(DestReg, 0, SubRegIdx0); 1327 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1328 .add(*Dest1) 1329 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1330 1331 moveInstsAfter(Copy1, InstsToMove); 1332 1333 CI.I->eraseFromParent(); 1334 Paired.I->eraseFromParent(); 1335 return New; 1336 } 1337 1338 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair( 1339 CombineInfo &CI, CombineInfo &Paired, 1340 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1341 MachineBasicBlock *MBB = CI.I->getParent(); 1342 DebugLoc DL = CI.I->getDebugLoc(); 1343 1344 const unsigned Opcode = getNewOpcode(CI, Paired); 1345 1346 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1347 1348 // Copy to the new source register. 1349 Register DestReg = MRI->createVirtualRegister(SuperRC); 1350 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset); 1351 1352 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg); 1353 1354 const unsigned Regs = getRegs(Opcode, *TII); 1355 1356 if (Regs & VADDR) 1357 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1358 1359 unsigned JoinedFormat = 1360 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1361 1362 // It shouldn't be possible to get this far if the two instructions 1363 // don't have a single memoperand, because MachineInstr::mayAlias() 1364 // will return true if this is the case. 1365 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1366 1367 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1368 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1369 1370 MachineInstr *New = 1371 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1372 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1373 .addImm(MergedOffset) // offset 1374 .addImm(JoinedFormat) // format 1375 .addImm(CI.GLC) // glc 1376 .addImm(CI.SLC) // slc 1377 .addImm(0) // tfe 1378 .addImm(CI.DLC) // dlc 1379 .addImm(0) // swz 1380 .addMemOperand( 1381 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1382 1383 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1384 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1385 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1386 1387 // Copy to the old destination registers. 
1388 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); 1389 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1390 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1391 1392 BuildMI(*MBB, Paired.I, DL, CopyDesc) 1393 .add(*Dest0) // Copy to same destination including flags and sub reg. 1394 .addReg(DestReg, 0, SubRegIdx0); 1395 MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc) 1396 .add(*Dest1) 1397 .addReg(DestReg, RegState::Kill, SubRegIdx1); 1398 1399 moveInstsAfter(Copy1, InstsToMove); 1400 1401 CI.I->eraseFromParent(); 1402 Paired.I->eraseFromParent(); 1403 return New; 1404 } 1405 1406 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair( 1407 CombineInfo &CI, CombineInfo &Paired, 1408 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1409 MachineBasicBlock *MBB = CI.I->getParent(); 1410 DebugLoc DL = CI.I->getDebugLoc(); 1411 1412 const unsigned Opcode = getNewOpcode(CI, Paired); 1413 1414 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1415 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1416 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1417 1418 // Copy to the new source register. 1419 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1420 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1421 1422 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1423 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1424 1425 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1426 .add(*Src0) 1427 .addImm(SubRegIdx0) 1428 .add(*Src1) 1429 .addImm(SubRegIdx1); 1430 1431 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1432 .addReg(SrcReg, RegState::Kill); 1433 1434 const unsigned Regs = getRegs(Opcode, *TII); 1435 1436 if (Regs & VADDR) 1437 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1438 1439 unsigned JoinedFormat = 1440 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM); 1441 1442 // It shouldn't be possible to get this far if the two instructions 1443 // don't have a single memoperand, because MachineInstr::mayAlias() 1444 // will return true if this is the case. 
1445 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1446 1447 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1448 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1449 1450 MachineInstr *New = 1451 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1452 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1453 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1454 .addImm(JoinedFormat) // format 1455 .addImm(CI.GLC) // glc 1456 .addImm(CI.SLC) // slc 1457 .addImm(0) // tfe 1458 .addImm(CI.DLC) // dlc 1459 .addImm(0) // swz 1460 .addMemOperand( 1461 combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1462 1463 moveInstsAfter(MIB, InstsToMove); 1464 1465 CI.I->eraseFromParent(); 1466 Paired.I->eraseFromParent(); 1467 return New; 1468 } 1469 1470 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, 1471 const CombineInfo &Paired) { 1472 const unsigned Width = CI.Width + Paired.Width; 1473 1474 switch (CI.InstClass) { 1475 default: 1476 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE); 1477 // FIXME: Handle d16 correctly 1478 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()), 1479 Width); 1480 case TBUFFER_LOAD: 1481 case TBUFFER_STORE: 1482 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()), 1483 Width); 1484 1485 case UNKNOWN: 1486 llvm_unreachable("Unknown instruction class"); 1487 case S_BUFFER_LOAD_IMM: 1488 switch (Width) { 1489 default: 1490 return 0; 1491 case 2: 1492 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM; 1493 case 4: 1494 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM; 1495 } 1496 case MIMG: 1497 assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width)); 1498 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width); 1499 } 1500 } 1501 1502 std::pair<unsigned, unsigned> 1503 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) { 1504 1505 if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4) 1506 return std::make_pair(0, 0); 1507 1508 bool ReverseOrder; 1509 if (CI.InstClass == MIMG) { 1510 assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) && 1511 "No overlaps"); 1512 ReverseOrder = CI.DMask > Paired.DMask; 1513 } else 1514 ReverseOrder = CI.Offset > Paired.Offset; 1515 1516 static const unsigned Idxs[4][4] = { 1517 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3}, 1518 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0}, 1519 {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0}, 1520 {AMDGPU::sub3, 0, 0, 0}, 1521 }; 1522 unsigned Idx0; 1523 unsigned Idx1; 1524 1525 assert(CI.Width >= 1 && CI.Width <= 3); 1526 assert(Paired.Width >= 1 && Paired.Width <= 3); 1527 1528 if (ReverseOrder) { 1529 Idx1 = Idxs[0][Paired.Width - 1]; 1530 Idx0 = Idxs[Paired.Width][CI.Width - 1]; 1531 } else { 1532 Idx0 = Idxs[0][CI.Width - 1]; 1533 Idx1 = Idxs[CI.Width][Paired.Width - 1]; 1534 } 1535 1536 return std::make_pair(Idx0, Idx1); 1537 } 1538 1539 const TargetRegisterClass * 1540 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI, 1541 const CombineInfo &Paired) { 1542 if (CI.InstClass == S_BUFFER_LOAD_IMM) { 1543 switch (CI.Width + Paired.Width) { 1544 default: 1545 return nullptr; 1546 case 2: 1547 return &AMDGPU::SReg_64_XEXECRegClass; 1548 case 4: 1549 return &AMDGPU::SGPR_128RegClass; 1550 case 8: 1551 return &AMDGPU::SGPR_256RegClass; 1552 case 16: 1553 return &AMDGPU::SGPR_512RegClass; 1554 } 1555 } 
else { 1556 switch (CI.Width + Paired.Width) { 1557 default: 1558 return nullptr; 1559 case 2: 1560 return &AMDGPU::VReg_64RegClass; 1561 case 3: 1562 return &AMDGPU::VReg_96RegClass; 1563 case 4: 1564 return &AMDGPU::VReg_128RegClass; 1565 } 1566 } 1567 } 1568 1569 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair( 1570 CombineInfo &CI, CombineInfo &Paired, 1571 const SmallVectorImpl<MachineInstr *> &InstsToMove) { 1572 MachineBasicBlock *MBB = CI.I->getParent(); 1573 DebugLoc DL = CI.I->getDebugLoc(); 1574 1575 const unsigned Opcode = getNewOpcode(CI, Paired); 1576 1577 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired); 1578 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); 1579 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx); 1580 1581 // Copy to the new source register. 1582 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); 1583 Register SrcReg = MRI->createVirtualRegister(SuperRC); 1584 1585 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata); 1586 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata); 1587 1588 BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg) 1589 .add(*Src0) 1590 .addImm(SubRegIdx0) 1591 .add(*Src1) 1592 .addImm(SubRegIdx1); 1593 1594 auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode)) 1595 .addReg(SrcReg, RegState::Kill); 1596 1597 const unsigned Regs = getRegs(Opcode, *TII); 1598 1599 if (Regs & VADDR) 1600 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr)); 1601 1602 1603 // It shouldn't be possible to get this far if the two instructions 1604 // don't have a single memoperand, because MachineInstr::mayAlias() 1605 // will return true if this is the case. 1606 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand()); 1607 1608 const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); 1609 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin(); 1610 1611 MachineInstr *New = 1612 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) 1613 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) 1614 .addImm(std::min(CI.Offset, Paired.Offset)) // offset 1615 .addImm(CI.GLC) // glc 1616 .addImm(CI.SLC) // slc 1617 .addImm(0) // tfe 1618 .addImm(CI.DLC) // dlc 1619 .addImm(0) // swz 1620 .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); 1621 1622 moveInstsAfter(MIB, InstsToMove); 1623 1624 CI.I->eraseFromParent(); 1625 Paired.I->eraseFromParent(); 1626 return New; 1627 } 1628 1629 MachineOperand 1630 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const { 1631 APInt V(32, Val, true); 1632 if (TII->isInlineConstant(V)) 1633 return MachineOperand::CreateImm(Val); 1634 1635 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); 1636 MachineInstr *Mov = 1637 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(), 1638 TII->get(AMDGPU::S_MOV_B32), Reg) 1639 .addImm(Val); 1640 (void)Mov; 1641 LLVM_DEBUG(dbgs() << " "; Mov->dump()); 1642 return MachineOperand::CreateReg(Reg, false); 1643 } 1644 1645 // Compute base address using Addr and return the final register. 
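// The 64-bit sum of (Addr.Base.LoReg, Addr.Base.HiReg) and Addr.Offset is
// materialized as a V_ADD_I32_e64 that produces a carry, a V_ADDC_U32_e64
// that consumes it, and a REG_SEQUENCE that combines the two halves into a
// vreg_64 result; see the implementation below.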
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
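// The 64-bit add is split into a carry-producing V_ADD_I32_e64 for the low
// half and a V_ADDC_U32_e64 for the high half, then recombined with a
// REG_SEQUENCE. For an offset of 0x1000 the emitted code looks roughly like
// this (virtual register names here are only illustrative):
//   %off:sreg_32 = S_MOV_B32 4096
//   %lo:vgpr_32, %carry:sreg_64_xexec = V_ADD_I32_e64 %base.sub0, %off, 0
//   %hi:vgpr_32, %dead:sreg_64_xexec = V_ADDC_U32_e64 %base.sub1, 0, %carry, 0
//   %newbase:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1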
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
// - the 32-bit base registers and subregisters
// - the 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
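// For the pattern above, the function below would record %BASE_LO/%BASE_HI
// (with their subregisters) in Addr.Base and 8000 in Addr.Offset; the high
// 32 bits of the offset come from the constant operand of the V_ADDC_U32_e64
// (0 in this example).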
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as
  // the new base (anchor) because the maximum distance can presumably
  // accommodate more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr, 0)
  //   load4 = load(addr, 2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());
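
  // Only instructions whose distance from MI's offset is itself a legal
  // global addressing-mode immediate (e.g. the 13-bit signed offset mentioned
  // in the file header, roughly -4096..4095 on such targets) can serve as an
  // anchor. In the example above, load1 is -4096 away from &a + 8192, which
  // is legal, while &a + 12288 is -8192 away and cannot be used.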
  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute the anchor instruction's base
    // address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
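
// Bucket CI into MergeableInsts: each inner list holds instructions of the
// same class that share a base address, and merges are only attempted within
// a list. For example, a block containing four ds_read_b32 from one address
// register and two buffer loads from an srsrc ends up as two separate lists.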
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile or ordered. We also won't be able to merge
    // across such an instruction, so break the search; separate merges can
    // still be found after this barrier.
    if (MI.hasOrderedMemoryRef()) {
      LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort the lists by offset, and then for each CombineInfo object in
  // a list try to find an instruction that can be merged with it. If one is
  // found, it is stored in the Paired field; if none is found, the
  // CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a
      // merge, so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset; this way mergeable instructions will be
    // adjacent to each other in the list, which makes it easier to find
    // matches.
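    // For example, instructions recorded with offsets {32, 0, 16, 8} are
    // reordered to {0, 8, 16, 32}, so candidate pairs end up next to each
    // other.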
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent memory operations with constant offsets
// from the same base register. We rely on the scheduler to do the hard work
// of clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}
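
// Merge instructions within a single base-address list, pairwise. Merging a
// pair can enable another merge in a later round: for example, four
// s_buffer_load_dword at offsets 0, 4, 8 and 12 are first combined into two
// s_buffer_load_dwordx2, and OptimizeListAgain is set so that a second round
// (driven by OptimizeAgain in runOnMachineFunction) can combine those into a
// single s_buffer_load_dwordx4.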
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    SmallVector<MachineInstr *, 8> InstsToMove;
    if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
      ++I;
      continue;
    }

    Modified = true;

    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ: {
      MachineBasicBlock::iterator NewMI =
          mergeRead2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case DS_WRITE: {
      MachineBasicBlock::iterator NewMI =
          mergeWrite2Pair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      break;
    }
    case S_BUFFER_LOAD_IMM: {
      MachineBasicBlock::iterator NewMI =
          mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
      break;
    }
    case BUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case BUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case MIMG: {
      MachineBasicBlock::iterator NewMI =
          mergeImagePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_LOAD: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferLoadPair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    case TBUFFER_STORE: {
      MachineBasicBlock::iterator NewMI =
          mergeTBufferStorePair(CI, Paired, InstsToMove);
      CI.setMI(NewMI, *TII, *STM);
      OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
      break;
    }
    }
    CI.Order = Paired.Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect a list of all instructions we know how to merge
      // in a subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}