//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together to fit in 8 bits, we can add
//   to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  AliasAnalysis *AA;

  static bool offsetsCanBeCombined(unsigned Offset0,
                                   unsigned Offset1,
                                   unsigned EltSize);

  MachineBasicBlock::iterator findMatchingDSInst(
    MachineBasicBlock::iterator I,
    unsigned EltSize,
    SmallVectorImpl<MachineInstr*> &InstsToMove);

  MachineBasicBlock::iterator mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

  MachineBasicBlock::iterator mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize,
    ArrayRef<MachineInstr*> InstsToMove);

public:
  static char ID;

  SILoadStoreOptimizer()
    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
      AA(nullptr) {}

  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Load / Store Optimizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
  return new SILoadStoreOptimizer(TM);
}

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsToList(const MachineInstr &MI,
                          SmallVectorImpl<const MachineOperand *> &Defs) {
  for (const MachineOperand &Def : MI.defs()) {
    Defs.push_back(&Def);
  }
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {

  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                unsigned Offset1,
                                                unsigned Size) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (Offset0 == Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
    return false;

  unsigned EltOffset0 = Offset0 / Size;
  unsigned EltOffset1 = Offset1 / Size;

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
    return true;

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64 != 0))
    return false;

  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
}
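
// Worked example for offsetsCanBeCombined() above (the byte offsets are
// illustrative only, not taken from any particular shader):
//   - Offsets 16 and 32 with Size == 4 give element offsets 4 and 8, which
//     both fit in the 8-bit offset0/offset1 fields, so the pair is combinable.
//   - Offsets 1024 and 1280 with Size == 4 give element offsets 256 and 320.
//     Neither fits in 8 bits, but both are multiples of 64, so the st64 forms
//     can encode them as 256/64 == 4 and 320/64 == 5.
//   - Offsets 1024 and 1028 with Size == 4 give element offsets 256 and 257;
//     257 is not a multiple of 64, so the pair is rejected.
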
MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                         unsigned EltSize,
                                         SmallVectorImpl<MachineInstr*> &InstsToMove) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  ++MBBI;

  SmallVector<const MachineOperand *, 8> DefsToMove;
  addDefsToList(*I, DefsToMove);

  for ( ; MBBI != E; ++MBBI) {

    if (MBBI->getOpcode() != I->getOpcode()) {

      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects())
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return E;

      if (MBBI->mayLoadOrStore() &&
          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2.  Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        InstsToMove.push_back(&*MBBI);
        addDefsToList(*MBBI, DefsToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction, so any uses of I will need
      // to be moved down as well.
      for (const MachineOperand *Def : DefsToMove) {
        bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
        // If ReadDef is true, then there is a use of Def between I
        // and the instruction that I will potentially be merged with. We
        // will need to move this instruction after the merged instructions.
        if (ReadDef) {
          InstsToMove.push_back(&*MBBI);
          addDefsToList(*MBBI, DefsToMove);
          break;
        }
      }
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return E;

    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

    // Check same base pointer. Be careful of subregisters, which can occur with
    // vectors of pointers.
    if (AddrReg0.getReg() == AddrReg1.getReg() &&
        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

      // Check both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
        return MBBI;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // FIXME: This is too conservative.
    break;
  }
  return E;
}
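
// Schematic sketch of the read2 merge performed below (illustrative only, not
// exact MIR syntax; register names are made up). For a pair of 4-byte reads:
//
//   %v0 = DS_READ_B32 %addr, offset:16
//   %v1 = DS_READ_B32 %addr, offset:32
// ==>
//   %v01 = DS_READ2_B32 %addr, offset0:4, offset1:8
//   %v0  = COPY %v01.sub0
//   %v1  = COPY %v01.sub1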
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non-st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
        .addOperand(*AddrReg) // addr
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());
  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}
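
// Schematic sketch of the write2 merge performed below (illustrative only, not
// exact MIR syntax; register names are made up). For a pair of 4-byte writes:
//
//   DS_WRITE_B32 %addr, %data0, offset:16
//   DS_WRITE_B32 %addr, %data1, offset:32
// ==>
//   DS_WRITE2_B32 %addr, %data0, %data1, offset0:4, offset1:8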
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize,
  ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non-st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, Paired, DL, Write2Desc)
        .addOperand(*Addr) // addr
        .addOperand(*Data0) // data0
        .addOperand(*Data1) // data1
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());

  moveInstsAfter(Write2, InstsToMove);

  MachineBasicBlock::iterator Next = std::next(I);
  I->eraseFromParent();
  Paired->eraseFromParent();

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    SmallVector<MachineInstr*, 8> InstsToMove;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeRead2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
                                                             InstsToMove);
      if (Match != E) {
        Modified = true;
        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}