//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13bit constant offset and then promotes the 13bit offset
// to the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD,
  BUFFER_STORE,
  MIMG,
  TBUFFER_LOAD,
  TBUFFER_STORE,
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  bool SBase = false;
  bool SRsrc = false;
  bool SOffset = false;
  bool VAddr = false;
  bool Addr = false;
  bool SSamp = false;
};

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    unsigned EltSize;
    unsigned Offset;
    unsigned Width;
    unsigned Format;
    unsigned BaseOff;
    unsigned DMask;
    InstClassEnum InstClass;
    unsigned CPol = 0;
    bool IsAGPR;
    bool UseST64;
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const MachineInstr &MI) {
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm()) {
            return false;
          }
          continue;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
        if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
            AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
          return false;
        }
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other
        // non-register operands.
        if (!AddrOp->isReg())
          return false;

        // TODO: We should be able to merge physical reg addresses.
        if (AddrOp->getReg().isPhysical())
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
  };

  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
                           const DenseSet<Register> &ARegUses,
                           const MachineInstr &A, const MachineInstr &B) const;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);
  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                          MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
                           int32_t NewOffset) const;
  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  void processBaseWithConstOffset(const MachineOperand &Base,
                                  MemAddress &Addr) const;
  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13bit constant offset which gets promoted to the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo> > &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
};

static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isMIMG(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return countPopulation(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:      LLVM_FALLTHROUGH;
  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B32:     LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:      LLVM_FALLTHROUGH;
  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B64:     LLVM_FALLTHROUGH;
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}

/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc)) {
      switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
        return BUFFER_LOAD;
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
        return BUFFER_STORE;
      }
    }
    if (TII.isMIMG(Opc)) {
      // Ignore instructions encoded without vaddr.
      if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
        return UNKNOWN;
      // Ignore BVH instructions.
      if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
        return UNKNOWN;
      // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
      if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
          TII.isGather4(Opc))
        return UNKNOWN;
      return MIMG;
    }
    if (TII.isMTBUF(Opc)) {
      switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
      default:
        return UNKNOWN;
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
        return TBUFFER_LOAD;
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
      case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
        return TBUFFER_STORE;
      }
    }
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isMIMG(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  }
}

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isMIMG(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  }
}

void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}

// This function assumes that \p A and \p B are identical except for
// size and offset, and they reference adjacent memory.
static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
                                                   const MachineMemOperand *A,
                                                   const MachineMemOperand *B) {
  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  unsigned Size = A->getSize() + B->getSize();
  // This function adds the offset parameter to the existing offset for A,
  // so we pass 0 here as the offset and then manually set it to the correct
  // value after the call.
  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  MMO->setOffset(MinOffset);
  return MMO;
}

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
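  // The two dmasks must not interleave: every bit of the smaller mask has to
  // lie below the lowest set bit of the larger mask, so the merged dmask
  // selects two contiguous groups of components. For example, dmasks 0b0011
  // and 0b1100 can be combined, while 0b0101 and 0b0010 cannot.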
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}

static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
  if (!OldFormatInfo)
    return 0;

  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
      llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
                                           ComponentCount,
                                           OldFormatInfo->NumFormat, STI);

  if (!NewFormatInfo)
    return 0;

  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
         NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

  return NewFormatInfo->Format;
}

// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
        llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
    if (!Info0)
      return false;
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
        llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
    if (!Info1)
      return false;

    if (Info0->BitsPerComp != Info1->BitsPerComp ||
        Info0->NumFormat != Info1->NumFormat)
      return false;

    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.
    if (Info0->BitsPerComp != 32)
      return false;

    if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
      return false;
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
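  // The *2ST64* forms scale both offsets by 64 elements, so e.g. dword element
  // offsets 128 and 192 (byte offsets 512 and 768 for b32) are encoded as
  // offset0:2 offset1:3 with CI.UseST64 set.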
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that when we adjust them by
      // subtracting BaseOff they will be multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      // From the range of values we could use for BaseOff, choose the one that
      // is aligned to the highest power of two, to maximise the chance that
      // the same offset can be reused for other load/store pairs.
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
    case 8:
      return true;
    }
  }
}

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}

/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
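/// On success, the returned CombineInfo designates where the merged
/// instruction will be inserted: at CI when Paired can be hoisted up to it
/// (loads), or at Paired when CI can be sunk down to it (stores).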
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
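    // For example, element offsets {6, 2} are emitted as offset0:2 offset1:6,
    // and the destination sub-register indices are swapped to match.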
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // tfe
          .addImm(0)            // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (CI.InstClass) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case MIMG:
    assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}

std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  bool ReverseOrder;
  if (CI.InstClass == MIMG) {
    assert(
        (countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
        "No overlaps");
    ReverseOrder = CI.DMask > Paired.DMask;
  } else {
    ReverseOrder = CI.Offset > Paired.Offset;
  }

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (ReverseOrder) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::make_pair(Idx0, Idx1);
}

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));


  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // tfe
          .addImm(0)                                  // swz
          .addMemOperand(
              combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}

MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
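// The 64-bit base is built as (LoReg + lo32(Offset)) with the carry feeding
// (HiReg + hi32(Offset)), using V_ADD_CO_U32_e64 / V_ADDC_U32_e64, and the two
// halves are packed back into a 64-bit VGPR pair with REG_SEQUENCE.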
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo)
          .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill)
          .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update base and offset with the NewBase and NewOffset in MI.
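// That is, after a successful promotion MI's vaddr is redirected to the shared
// anchor base and its immediate offset field carries the per-access delta
// relative to that anchor.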
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
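  // Only opcodes for which a global SADDR form exists are candidates;
  // getGlobalSaddrOp returns a negative value for everything else.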
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor (one with the
  // same base registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has a 13bit distance. Both &a + 6144 and &a + 8192
  // have a 13bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  // new base (anchor) because the maximum distance can presumably accommodate
  // more intermediate bases.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr, -4096)
  //   load2 = load(addr, -2048)
  //   load3 = load(addr,  0)
  //   load4 = load(addr,  2048)
  //   addr5 = &a + 12288;  load5 = load(addr5, 0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
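    // Only instructions with the same opcode and a zero immediate offset are
    // considered as potential anchors sharing this base.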
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
                      << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look for separate merges after this barrier.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with the "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      // operands. However we are reporting that ds_write2 shall have
      // only VGPR data so that machine copy propagation does not
      // create an illegal instruction with VGPR and AGPR sources.
      // Consequently, if we create such an instruction the verifier
      // will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I. If an
  // instruction is found, it is stored in the Paired field. If no instructions
  // are found, then the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets; this way mergeable instructions will be
    // adjacent to each other in the list, which makes it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::make_pair(BlockI, Modified);
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
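// Each per-base-address list is processed independently; lists that produce no
// merge, or that cannot be merged any further, are dropped so that the
// OptimizeAgain-driven reiteration only revisits lists that may still combine
// into wider accesses.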
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
       E = MergeableInsts.end(); I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so delete the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}

bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  SmallPtrSet<MachineInstr *, 4> AnchorList;
  MemInfoMap Visited;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}