//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit offset
//   fields, but are close enough to each other, we can add to the base pointer
//   and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;

  static bool offsetsCanBeCombined(unsigned Offset0,
                                   unsigned Offset1,
                                   unsigned EltSize);

  MachineBasicBlock::iterator findMatchingDSInst(
    MachineBasicBlock::iterator I,
    unsigned EltSize,
    SmallVectorImpl<MachineInstr*> &InstsToMove);

  MachineBasicBlock::iterator mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

  MachineBasicBlock::iterator mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {}

  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
  return new SILoadStoreOptimizer(TM);
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI,
                          SmallVectorImpl<const MachineOperand *> &Defs) {
  for (const MachineOperand &Def : MI.defs()) {
    Defs.push_back(&Def);
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
          // RAW or WAR - cannot reorder
          // WAW - cannot reorder
          // RAR - safe to reorder
          !(A->mayStore() || B->mayStore()));
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      SmallVectorImpl<const MachineOperand *> &Defs,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (const MachineOperand *Def : Defs) {
    bool ReadDef = MI.readsVirtualRegister(Def->getReg());
    // If ReadDef is true, then there is a use of Def between I
    // and the instruction that I will potentially be merged with. We
    // will need to move this instruction after the merged instructions.
    if (ReadDef) {
      Insts.push_back(&MI);
      addDefsToList(MI, Defs);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}
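
// A quick worked example of the offset encoding checked below (values here
// are illustrative only): with EltSize = 4, byte offsets 16 and 32 become
// element offsets 4 and 8, which fit the 8-bit offset0/offset1 fields of
// ds_read2/ds_write2 directly. Byte offsets 1024 and 2048 give element
// offsets 256 and 512, which do not fit in 8 bits, but both are multiples of
// 64, so the st64 forms can encode them as 4 and 8 (in units of 64 elements).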

bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                unsigned Offset1,
                                                unsigned Size) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (Offset0 == Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
    return false;

  unsigned EltOffset0 = Offset0 / Size;
  unsigned EltOffset1 = Offset1 / Size;

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
    return true;

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
    return false;

  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                         unsigned EltSize,
                                         SmallVectorImpl<MachineInstr*> &InstsToMove) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  ++MBBI;

  SmallVector<const MachineOperand *, 8> DefsToMove;
  addDefsToList(*I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != I->getOpcode()) {

      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects())
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return E;

      if (MBBI->mayLoadOrStore() &&
          !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      addToListsIfDependent(*MBBI, DefsToMove, InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return E;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, DefsToMove, InstsToMove))
      continue;

    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
        return MBBI;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||    // check if we can move I across MBBI
        !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
        )
      break;
  }
  return E;
}
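
// For illustration, merging a pair such as
//   %v0 = DS_READ_B32 %addr, 16, 0
//   %v1 = DS_READ_B32 %addr, 32, 0
// with EltSize = 4 produces roughly
//   %dst = DS_READ2_B32 %addr, 4, 8, 0
//   %v0 = COPY %dst.sub0
//   %v1 = COPY %dst.sub1
// (register names above are illustrative). The COPYs back into the original
// destination registers mean no uses of %v0 or %v1 need to be rewritten.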

MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non-st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2 = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
    .add(*AddrReg) // addr
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
    .add(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
    .add(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}
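
// For illustration, merging a pair such as
//   DS_WRITE_B32 %addr, %data0, 16, 0
//   DS_WRITE_B32 %addr, %data1, 32, 0
// with EltSize = 4 produces roughly
//   DS_WRITE2_B32 %addr, %data0, %data1, 4, 8, 0
// (register names above are illustrative). Unlike the read case, no COPYs are
// needed because the writes define no registers.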

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .add() and not .addReg() with these. We want to be sure we
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non-st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2 = BuildMI(*MBB, Paired, DL, Write2Desc)
    .add(*Addr) // addr
    .add(*Data0) // data0
    .add(*Data1) // data1
    .addImm(NewOffset0) // offset0
    .addImm(NewOffset1) // offset1
    .addImm(0) // gds
    .addMemOperand(*I->memoperands_begin())
    .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    SmallVector<MachineInstr*, 8> InstsToMove;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeRead2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}