//===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together that rebased offsets would fit, we
//   can add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
private:
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  static bool offsetsCanBeCombined(unsigned Offset0,
                                   unsigned Offset1,
                                   unsigned EltSize);

  MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
                                                 unsigned EltSize);

  MachineBasicBlock::iterator mergeRead2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize);

  MachineBasicBlock::iterator mergeWrite2Pair(
    MachineBasicBlock::iterator I,
    MachineBasicBlock::iterator Paired,
    unsigned EltSize);

public:
  static char ID;

  SILoadStoreOptimizer()
    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
      LIS(nullptr) {}

  SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Load / Store Optimizer";
  }

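  // The NoPHIs property below documents that this pass expects a
  // MachineFunction with no remaining PHI nodes (presumably so the merge logic
  // never has to rewrite PHI operands when it erases instructions).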
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
      MachineFunctionProperties::Property::NoPHIs);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreserved<LiveVariables>();
    AU.addRequired<LiveIntervals>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
  return new SILoadStoreOptimizer(TM);
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                unsigned Offset1,
                                                unsigned Size) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (Offset0 == Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
    return false;

  unsigned EltOffset0 = Offset0 / Size;
  unsigned EltOffset1 = Offset1 / Size;

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
    return true;

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64 != 0))
    return false;

  return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
                                         unsigned EltSize) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock &MBB = *I->getParent();
  MachineBasicBlock::iterator MBBI = I;
  ++MBBI;

  if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
    return E;

  // Don't merge volatiles.
  if (MBBI->hasOrderedMemoryRef())
    return E;

  int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
  const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
  const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);

  // Check same base pointer. Be careful of subregisters, which can occur with
  // vectors of pointers.
  if (AddrReg0.getReg() == AddrReg1.getReg() &&
      AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
                                               AMDGPU::OpName::offset);
    unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
    unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;

    // Check both offsets fit in the reduced range.
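    // For example (matching the header comment), two DS_READ_B32s at byte
    // offsets 16 and 32 have element offsets 4 and 8 with EltSize == 4, which
    // both fit in the 8-bit offset0/offset1 fields of ds_read2_b32.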
    if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
      return MBBI;
  }

  return E;
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize) {
  MachineBasicBlock *MBB = I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);

  const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
  const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non-st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = I->getDebugLoc();
  MachineInstrBuilder Read2
    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
        .addOperand(*AddrReg) // addr
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());

  unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
    .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
    .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
    .addOperand(*Dest1)
    .addReg(DestReg, RegState::Kill, SubRegIdx1);

  LIS->InsertMachineInstrInMaps(*Read2);

  // repairIntervalsInRange() doesn't handle physical registers, so we have
  // to update the M0 range manually.
  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();

  // The new write to the original destination register is now the copy. Steal
  // the old SlotIndex.
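  // (ReplaceMachineInstrInMaps gives the copy the SlotIndex of the erased
  // instruction, so the original destination registers are still defined at
  // the same slots and their live intervals do not need to be recomputed.)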
  LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
  LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);

  I->eraseFromParent();
  Paired->eraseFromParent();

  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
  LIS->shrinkToUses(&AddrRegLI);

  LIS->createAndComputeVirtRegInterval(DestReg);

  if (UpdateM0Range) {
    SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
    M0Segment->end = Read2Index.getRegSlot();
  }

  DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2.getInstr();
}

MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  MachineBasicBlock::iterator I,
  MachineBasicBlock::iterator Paired,
  unsigned EltSize) {
  MachineBasicBlock *MBB = I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);

  unsigned Offset0
    = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
  unsigned Offset1
    = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;

  unsigned NewOffset0 = Offset0 / EltSize;
  unsigned NewOffset1 = Offset1 / EltSize;
  unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

  // Prefer the st64 form if we can use it, even if we can fit the offset in the
  // non-st64 version. I'm not sure if there's any real reason to do this.
  bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
  if (UseST64) {
    NewOffset0 /= 64;
    NewOffset1 /= 64;
    Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = I->getDebugLoc();

  // repairIntervalsInRange() doesn't handle physical registers, so we have
  // to update the M0 range manually.
  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();

  MachineInstrBuilder Write2
    = BuildMI(*MBB, I, DL, Write2Desc)
        .addOperand(*Addr) // addr
        .addOperand(*Data0) // data0
        .addOperand(*Data1) // data1
        .addImm(NewOffset0) // offset0
        .addImm(NewOffset1) // offset1
        .addImm(0) // gds
        .addMemOperand(*I->memoperands_begin())
        .addMemOperand(*Paired->memoperands_begin());

  // XXX - How do we express subregisters here?
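  // OrigRegs feeds repairIntervalsInRange() below, which recomputes the live
  // intervals of these virtual registers across the new write2 after the two
  // original stores have been erased.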
  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };

  LIS->RemoveMachineInstrFromMaps(*I);
  LIS->RemoveMachineInstrFromMaps(*Paired);
  I->eraseFromParent();
  Paired->eraseFromParent();

  // This doesn't handle physical registers like M0.
  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);

  if (UpdateM0Range) {
    SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
    M0Segment->end = Write2Index.getRegSlot();
  }

  DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2.getInstr();
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
      unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
      if (Match != E) {
        Modified = true;
        I = mergeRead2Pair(I, Match, Size);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
      unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
      if (Match != E) {
        Modified = true;
        I = mergeWrite2Pair(I, Match, Size);
      } else {
        ++I;
      }

      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(*MF.getFunction()))
    return false;

  const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
  if (!STM.loadStoreOptEnabled())
    return false;

  TII = STM.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();

  LIS = &getAnalysis<LiveIntervals>();

  DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF)
    Modified |= optimizeBlock(MBB);

  return Modified;
}