//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  AliasAnalysis *AA;

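  // Returns true if the byte offsets Offset0 and Offset1 can both be encoded
  // in the offset0/offset1 fields of a single read2/write2 instruction
  // (possibly one of the st64 forms) for elements of EltSize bytes.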
  static bool offsetsCanBeCombined(unsigned Offset0,
                                   unsigned Offset1,
                                   unsigned EltSize);

  MachineBasicBlock::iterator findMatchingDSInst(
    MachineBasicBlock::iterator I,
    unsigned EltSize,
    SmallVectorImpl<MachineInstr*> &InstsToMove);

  MachineBasicBlock::iterator mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

  MachineBasicBlock::iterator mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

public:
  static char ID;

  SILoadStoreOptimizer()
    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
      AA(nullptr) {}

  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load / Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
  return new SILoadStoreOptimizer(TM);
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI,
                          SmallVectorImpl<const MachineOperand *> &Defs) {
  for (const MachineOperand &Def : MI.defs()) {
    Defs.push_back(&Def);
  }
}

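// Returns true if every memory access in InstsToMove is trivially disjoint
// from MemOp, so the whole list can safely be moved across it.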
static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {

  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                unsigned Offset1,
                                                unsigned Size) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (Offset0 == Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
    return false;

  unsigned EltOffset0 = Offset0 / Size;
  unsigned EltOffset1 = Offset1 / Size;

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
    return true;

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
    return false;

  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                         unsigned EltSize,
                                         SmallVectorImpl<MachineInstr*> &InstsToMove) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  ++MBBI;

  SmallVector<const MachineOperand *, 8> DefsToMove;
  addDefsToList(*I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {

    if (MBBI->getOpcode() != I->getOpcode()) {

      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects())
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return E;

      if (MBBI->mayLoadOrStore() &&
          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      for (const MachineOperand *Def : DefsToMove) {
        bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
        // If ReadDef is true, then there is a use of Def between I
        // and the instruction that I will potentially be merged with. We
        // will need to move this instruction after the merged instructions.
        if (ReadDef) {
          InstsToMove.push_back(&*MBBI);
          addDefsToList(*MBBI, DefsToMove);
          break;
        }
      }
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return E;

    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
        return MBBI;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // FIXME: This is too conservative.
    break;
  }
  return E;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
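  // read2 encodes offset0/offset1 in element-sized units (64-element units
  // for the st64 variants), hence the division by EltSize above.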
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
        .addOperand(*AddrReg) // addr
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
                          .addOperand(*Dest1)
                          .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non st64 version. I'm not sure if there's any real reason to do this.
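  // The st64 forms scale the encoded offsets by a further factor of 64
  // elements, which lets larger strided offsets fit in the 8-bit fields.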
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, Paired, DL, Write2Desc)
        .addOperand(*Addr) // addr
        .addOperand(*Data0) // data0
        .addOperand(*Data1) // data1
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    SmallVector<MachineInstr*, 8> InstsToMove;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeRead2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}