//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, then recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together, we can add to the base pointer and
//   use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  AliasAnalysis *AA;

  static bool offsetsCanBeCombined(unsigned Offset0,
                                   unsigned Offset1,
                                   unsigned EltSize);

  MachineBasicBlock::iterator findMatchingDSInst(
    MachineBasicBlock::iterator I,
    unsigned EltSize,
    SmallVectorImpl<MachineInstr*> &InstsToMove);

  MachineBasicBlock::iterator mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

  MachineBasicBlock::iterator mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

public:
  static char ID;

  SILoadStoreOptimizer()
    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
      AA(nullptr) {}

  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
  return new SILoadStoreOptimizer(TM);
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI,
                          SmallVectorImpl<const MachineOperand *> &Defs) {
  for (const MachineOperand &Def : MI.defs()) {
    Defs.push_back(&Def);
  }
}

static bool memAccessesCanBeReordered(
  MachineBasicBlock::iterator A,
  MachineBasicBlock::iterator B,
  const SIInstrInfo *TII,
  llvm::AliasAnalysis *AA) {
  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
          // RAW or WAR - cannot reorder
          // WAW - cannot reorder
          // RAR - safe to reorder
          !(A->mayStore() || B->mayStore()));
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      SmallVectorImpl<const MachineOperand *> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (const MachineOperand *Def : Defs) {
    bool ReadDef = MI.readsVirtualRegister(Def->getReg());
    // If ReadDef is true, then there is a use of Def between I
    // and the instruction that I will potentially be merged with. We
    // will need to move this instruction after the merged instructions.
    if (ReadDef) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {

  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                unsigned Offset1,
                                                unsigned Size) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (Offset0 == Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
    return false;

  unsigned EltOffset0 = Offset0 / Size;
  unsigned EltOffset1 = Offset1 / Size;

  // Check if the new offsets fit in the reduced 8-bit range.
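  // (The offset0/offset1 fields of ds_read2/ds_write2 are 8-bit values counted
  // in elements rather than bytes, which is why we divided by the element size
  // above.)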
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
    return true;

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64 != 0))
    return false;

  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                         unsigned EltSize,
                                         SmallVectorImpl<MachineInstr*> &InstsToMove) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  ++MBBI;

  SmallVector<const MachineOperand *, 8> DefsToMove;
  addDefsToList(*I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {

    if (MBBI->getOpcode() != I->getOpcode()) {

      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects())
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return E;

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return E;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
      continue;

    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
        return MBBI;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA)) // check if we can move all I's users
      break;
  }
  return E;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2 = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
      .add(*AddrReg) // addr
      .addImm(NewOffset0) // offset0
      .addImm(NewOffset1) // offset1
      .addImm(0) // gds
      .addMemOperand(*I->memoperands_begin())
      .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .add(), and not .addReg() with these. We want to be sure
  // we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2 = BuildMI(*MBB, Paired, DL, Write2Desc)
      .add(*Addr) // addr
      .add(*Data0) // data0
      .add(*Data1) // data1
      .addImm(NewOffset0) // offset0
      .addImm(NewOffset1) // offset1
      .addImm(0) // gds
      .addMemOperand(*I->memoperands_begin())
      .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
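    // (hasOrderedMemoryRef() is a conservative check: it also rejects atomics
    // and instructions whose memory access is unknown, not just volatiles.)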
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    SmallVector<MachineInstr*, 8> InstsToMove;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeRead2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}