//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly to
//   run before scheduling. It currently misses stores of constants because
//   loading the constant into the data register is placed between the stores,
//   although this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough to each other that the reduced offsets
//   would fit, we can add to the base pointer and use the new, smaller offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  AliasAnalysis *AA;

  static bool offsetsCanBeCombined(unsigned Offset0,
                                   unsigned Offset1,
                                   unsigned EltSize);

  MachineBasicBlock::iterator findMatchingDSInst(
    MachineBasicBlock::iterator I,
    unsigned EltSize,
    SmallVectorImpl<MachineInstr*> &InstsToMove);

  MachineBasicBlock::iterator mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

  MachineBasicBlock::iterator mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

public:
  static char ID;

  SILoadStoreOptimizer()
    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
      AA(nullptr) {}

  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Load / Store Optimizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
  return new SILoadStoreOptimizer(TM);
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI,
                          SmallVectorImpl<const MachineOperand *> &Defs) {
  for (const MachineOperand &Def : MI.defs()) {
    Defs.push_back(&Def);
  }
}

static bool memAccessesCanBeReordered(
  MachineBasicBlock::iterator A,
  MachineBasicBlock::iterator B,
  const SIInstrInfo *TII,
  llvm::AliasAnalysis *AA) {
  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
          // RAW or WAR - cannot reorder
          // WAW - cannot reorder
          // RAR - safe to reorder
          !(A->mayStore() || B->mayStore()));
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      SmallVectorImpl<const MachineOperand *> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (const MachineOperand *Def : Defs) {
    bool ReadDef = MI.readsVirtualRegister(Def->getReg());
    // If ReadDef is true, then there is a use of Def between I
    // and the instruction that I will potentially be merged with. We
    // will need to move this instruction after the merged instructions.
    if (ReadDef) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {

  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                unsigned Offset1,
                                                unsigned Size) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (Offset0 == Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
    return false;

  unsigned EltOffset0 = Offset0 / Size;
  unsigned EltOffset1 = Offset1 / Size;

  // Check if the new offsets fit in the reduced 8-bit range.
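  // A minimal worked example (values assumed for illustration only): with
  // Size == 4, byte offsets 16 and 32 become element offsets 4 and 8, which
  // both fit in 8 bits and can be encoded directly.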
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
    return true;

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
    return false;

  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                         unsigned EltSize,
                                         SmallVectorImpl<MachineInstr*> &InstsToMove) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  ++MBBI;

  SmallVector<const MachineOperand *, 8> DefsToMove;
  addDefsToList(*I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {

    if (MBBI->getOpcode() != I->getOpcode()) {

      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects())
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return E;

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return E;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
      continue;

    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
        return MBBI;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check both that I can be moved across MBBI and that everything already
    // in InstsToMove can be moved across MBBI.
    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
      break;
  }
  return E;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
        .addOperand(*AddrReg) // addr
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
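
  // Rough sketch of the resulting code for the EltSize == 4 case (register
  // names and operand spelling assumed for illustration only):
  //   %wide = DS_READ2_B32 %addr, offset0, offset1, gds=0
  //   %dest0 = COPY %wide.sub0
  //   %dest1 = COPY %wide.sub1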

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, Paired, DL, Write2Desc)
        .addOperand(*Addr) // addr
        .addOperand(*Data0) // data0
        .addOperand(*Data1) // data1
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}
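
// For reference, a sketch of the rewrite mergeWrite2Pair performs, analogous
// to the read2 example in the file header (operand values assumed for
// illustration only):
//   ds_write_b32 v2, v0 offset:16
//   ds_write_b32 v2, v1 offset:32
//   ==>
//   ds_write2_b32 v2, v0, v1 offset0:4 offset1:8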

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    SmallVector<MachineInstr*, 8> InstsToMove;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeRead2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}