1*cfb7ffdeSJay Foad //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===// 2*cfb7ffdeSJay Foad // 3*cfb7ffdeSJay Foad // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*cfb7ffdeSJay Foad // See https://llvm.org/LICENSE.txt for license information. 5*cfb7ffdeSJay Foad // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*cfb7ffdeSJay Foad // 7*cfb7ffdeSJay Foad //===----------------------------------------------------------------------===// 8*cfb7ffdeSJay Foad // 9*cfb7ffdeSJay Foad /// \file 10*cfb7ffdeSJay Foad /// Insert s_delay_alu instructions to avoid stalls on GFX11+. 11*cfb7ffdeSJay Foad // 12*cfb7ffdeSJay Foad //===----------------------------------------------------------------------===// 13*cfb7ffdeSJay Foad 14*cfb7ffdeSJay Foad #include "AMDGPU.h" 15*cfb7ffdeSJay Foad #include "GCNSubtarget.h" 16*cfb7ffdeSJay Foad #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 17*cfb7ffdeSJay Foad #include "SIInstrInfo.h" 18*cfb7ffdeSJay Foad #include "llvm/ADT/SetVector.h" 19*cfb7ffdeSJay Foad 20*cfb7ffdeSJay Foad using namespace llvm; 21*cfb7ffdeSJay Foad 22*cfb7ffdeSJay Foad #define DEBUG_TYPE "amdgpu-insert-delay-alu" 23*cfb7ffdeSJay Foad 24*cfb7ffdeSJay Foad namespace { 25*cfb7ffdeSJay Foad 26*cfb7ffdeSJay Foad class AMDGPUInsertDelayAlu : public MachineFunctionPass { 27*cfb7ffdeSJay Foad public: 28*cfb7ffdeSJay Foad static char ID; 29*cfb7ffdeSJay Foad 30*cfb7ffdeSJay Foad const SIInstrInfo *SII; 31*cfb7ffdeSJay Foad const TargetRegisterInfo *TRI; 32*cfb7ffdeSJay Foad 33*cfb7ffdeSJay Foad TargetSchedModel SchedModel; 34*cfb7ffdeSJay Foad AMDGPUInsertDelayAlu()35*cfb7ffdeSJay Foad AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {} 36*cfb7ffdeSJay Foad getAnalysisUsage(AnalysisUsage & AU) const37*cfb7ffdeSJay Foad void getAnalysisUsage(AnalysisUsage &AU) const override { 38*cfb7ffdeSJay Foad AU.setPreservesCFG(); 39*cfb7ffdeSJay Foad MachineFunctionPass::getAnalysisUsage(AU); 40*cfb7ffdeSJay Foad } 41*cfb7ffdeSJay Foad 42*cfb7ffdeSJay Foad // Return true if MI waits for all outstanding VALU instructions to complete. instructionWaitsForVALU(const MachineInstr & MI)43*cfb7ffdeSJay Foad static bool instructionWaitsForVALU(const MachineInstr &MI) { 44*cfb7ffdeSJay Foad // These instruction types wait for VA_VDST==0 before issuing. 45*cfb7ffdeSJay Foad const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP | 46*cfb7ffdeSJay Foad SIInstrFlags::FLAT | SIInstrFlags::MIMG | 47*cfb7ffdeSJay Foad SIInstrFlags::MTBUF | SIInstrFlags::MUBUF; 48*cfb7ffdeSJay Foad if (MI.getDesc().TSFlags & VA_VDST_0) 49*cfb7ffdeSJay Foad return true; 50*cfb7ffdeSJay Foad if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 || 51*cfb7ffdeSJay Foad MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64) 52*cfb7ffdeSJay Foad return true; 53*cfb7ffdeSJay Foad if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && 54*cfb7ffdeSJay Foad (MI.getOperand(0).getImm() & 0xf000) == 0) 55*cfb7ffdeSJay Foad return true; 56*cfb7ffdeSJay Foad return false; 57*cfb7ffdeSJay Foad } 58*cfb7ffdeSJay Foad 59*cfb7ffdeSJay Foad // Types of delay that can be encoded in an s_delay_alu instruction. 60*cfb7ffdeSJay Foad enum DelayType { VALU, TRANS, SALU, OTHER }; 61*cfb7ffdeSJay Foad 62*cfb7ffdeSJay Foad // Get the delay type for an instruction with the specified TSFlags. getDelayType(uint64_t TSFlags)63*cfb7ffdeSJay Foad static DelayType getDelayType(uint64_t TSFlags) { 64*cfb7ffdeSJay Foad if (TSFlags & SIInstrFlags::TRANS) 65*cfb7ffdeSJay Foad return TRANS; 66*cfb7ffdeSJay Foad if (TSFlags & SIInstrFlags::VALU) 67*cfb7ffdeSJay Foad return VALU; 68*cfb7ffdeSJay Foad if (TSFlags & SIInstrFlags::SALU) 69*cfb7ffdeSJay Foad return SALU; 70*cfb7ffdeSJay Foad return OTHER; 71*cfb7ffdeSJay Foad } 72*cfb7ffdeSJay Foad 73*cfb7ffdeSJay Foad // Information about the last instruction(s) that wrote to a particular 74*cfb7ffdeSJay Foad // regunit. In straight-line code there will only be one such instruction, but 75*cfb7ffdeSJay Foad // when control flow converges we merge the delay information from each path 76*cfb7ffdeSJay Foad // to represent the union of the worst-case delays of each type. 77*cfb7ffdeSJay Foad struct DelayInfo { 78*cfb7ffdeSJay Foad // One larger than the maximum number of (non-TRANS) VALU instructions we 79*cfb7ffdeSJay Foad // can encode in an s_delay_alu instruction. 80*cfb7ffdeSJay Foad static const unsigned VALU_MAX = 5; 81*cfb7ffdeSJay Foad 82*cfb7ffdeSJay Foad // One larger than the maximum number of TRANS instructions we can encode in 83*cfb7ffdeSJay Foad // an s_delay_alu instruction. 84*cfb7ffdeSJay Foad static const unsigned TRANS_MAX = 4; 85*cfb7ffdeSJay Foad 86*cfb7ffdeSJay Foad // If it was written by a (non-TRANS) VALU, remember how many clock cycles 87*cfb7ffdeSJay Foad // are left until it completes, and how many other (non-TRANS) VALU we have 88*cfb7ffdeSJay Foad // seen since it was issued. 89*cfb7ffdeSJay Foad uint8_t VALUCycles = 0; 90*cfb7ffdeSJay Foad uint8_t VALUNum = VALU_MAX; 91*cfb7ffdeSJay Foad 92*cfb7ffdeSJay Foad // If it was written by a TRANS, remember how many clock cycles are left 93*cfb7ffdeSJay Foad // until it completes, and how many other TRANS we have seen since it was 94*cfb7ffdeSJay Foad // issued. 95*cfb7ffdeSJay Foad uint8_t TRANSCycles = 0; 96*cfb7ffdeSJay Foad uint8_t TRANSNum = TRANS_MAX; 97*cfb7ffdeSJay Foad // Also remember how many other (non-TRANS) VALU we have seen since it was 98*cfb7ffdeSJay Foad // issued. When an instruction depends on both a prior TRANS and a prior 99*cfb7ffdeSJay Foad // non-TRANS VALU, this is used to decide whether to encode a wait for just 100*cfb7ffdeSJay Foad // one or both of them. 101*cfb7ffdeSJay Foad uint8_t TRANSNumVALU = VALU_MAX; 102*cfb7ffdeSJay Foad 103*cfb7ffdeSJay Foad // If it was written by an SALU, remember how many clock cycles are left 104*cfb7ffdeSJay Foad // until it completes. 105*cfb7ffdeSJay Foad uint8_t SALUCycles = 0; 106*cfb7ffdeSJay Foad 107*cfb7ffdeSJay Foad DelayInfo() = default; 108*cfb7ffdeSJay Foad DelayInfo__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo109*cfb7ffdeSJay Foad DelayInfo(DelayType Type, unsigned Cycles) { 110*cfb7ffdeSJay Foad switch (Type) { 111*cfb7ffdeSJay Foad default: 112*cfb7ffdeSJay Foad llvm_unreachable("unexpected type"); 113*cfb7ffdeSJay Foad case VALU: 114*cfb7ffdeSJay Foad VALUCycles = Cycles; 115*cfb7ffdeSJay Foad VALUNum = 0; 116*cfb7ffdeSJay Foad break; 117*cfb7ffdeSJay Foad case TRANS: 118*cfb7ffdeSJay Foad TRANSCycles = Cycles; 119*cfb7ffdeSJay Foad TRANSNum = 0; 120*cfb7ffdeSJay Foad TRANSNumVALU = 0; 121*cfb7ffdeSJay Foad break; 122*cfb7ffdeSJay Foad case SALU: 123*cfb7ffdeSJay Foad SALUCycles = Cycles; 124*cfb7ffdeSJay Foad break; 125*cfb7ffdeSJay Foad } 126*cfb7ffdeSJay Foad } 127*cfb7ffdeSJay Foad operator ==__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo128*cfb7ffdeSJay Foad bool operator==(const DelayInfo &RHS) const { 129*cfb7ffdeSJay Foad return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum && 130*cfb7ffdeSJay Foad TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum && 131*cfb7ffdeSJay Foad TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles; 132*cfb7ffdeSJay Foad } 133*cfb7ffdeSJay Foad operator !=__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo134*cfb7ffdeSJay Foad bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); } 135*cfb7ffdeSJay Foad 136*cfb7ffdeSJay Foad // Merge another DelayInfo into this one, to represent the union of the 137*cfb7ffdeSJay Foad // worst-case delays of each type. merge__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo138*cfb7ffdeSJay Foad void merge(const DelayInfo &RHS) { 139*cfb7ffdeSJay Foad VALUCycles = std::max(VALUCycles, RHS.VALUCycles); 140*cfb7ffdeSJay Foad VALUNum = std::min(VALUNum, RHS.VALUNum); 141*cfb7ffdeSJay Foad TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles); 142*cfb7ffdeSJay Foad TRANSNum = std::min(TRANSNum, RHS.TRANSNum); 143*cfb7ffdeSJay Foad TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU); 144*cfb7ffdeSJay Foad SALUCycles = std::max(SALUCycles, RHS.SALUCycles); 145*cfb7ffdeSJay Foad } 146*cfb7ffdeSJay Foad 147*cfb7ffdeSJay Foad // Update this DelayInfo after issuing an instruction. IsVALU should be 1 148*cfb7ffdeSJay Foad // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing 149*cfb7ffdeSJay Foad // a TRANS, else 0. Cycles is the number of cycles it takes to issue the 150*cfb7ffdeSJay Foad // instruction. Return true if there is no longer any useful delay info. advance__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo151*cfb7ffdeSJay Foad bool advance(DelayType Type, unsigned Cycles) { 152*cfb7ffdeSJay Foad bool Erase = true; 153*cfb7ffdeSJay Foad 154*cfb7ffdeSJay Foad VALUNum += (Type == VALU); 155*cfb7ffdeSJay Foad if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) { 156*cfb7ffdeSJay Foad // Forget about the VALU instruction. It was too far back or has 157*cfb7ffdeSJay Foad // definitely completed by now. 158*cfb7ffdeSJay Foad VALUNum = VALU_MAX; 159*cfb7ffdeSJay Foad VALUCycles = 0; 160*cfb7ffdeSJay Foad } else { 161*cfb7ffdeSJay Foad VALUCycles -= Cycles; 162*cfb7ffdeSJay Foad Erase = false; 163*cfb7ffdeSJay Foad } 164*cfb7ffdeSJay Foad 165*cfb7ffdeSJay Foad TRANSNum += (Type == TRANS); 166*cfb7ffdeSJay Foad TRANSNumVALU += (Type == VALU); 167*cfb7ffdeSJay Foad if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) { 168*cfb7ffdeSJay Foad // Forget about any TRANS instruction. It was too far back or has 169*cfb7ffdeSJay Foad // definitely completed by now. 170*cfb7ffdeSJay Foad TRANSNum = TRANS_MAX; 171*cfb7ffdeSJay Foad TRANSNumVALU = VALU_MAX; 172*cfb7ffdeSJay Foad TRANSCycles = 0; 173*cfb7ffdeSJay Foad } else { 174*cfb7ffdeSJay Foad TRANSCycles -= Cycles; 175*cfb7ffdeSJay Foad Erase = false; 176*cfb7ffdeSJay Foad } 177*cfb7ffdeSJay Foad 178*cfb7ffdeSJay Foad if (SALUCycles <= Cycles) { 179*cfb7ffdeSJay Foad // Forget about any SALU instruction. It has definitely completed by 180*cfb7ffdeSJay Foad // now. 181*cfb7ffdeSJay Foad SALUCycles = 0; 182*cfb7ffdeSJay Foad } else { 183*cfb7ffdeSJay Foad SALUCycles -= Cycles; 184*cfb7ffdeSJay Foad Erase = false; 185*cfb7ffdeSJay Foad } 186*cfb7ffdeSJay Foad 187*cfb7ffdeSJay Foad return Erase; 188*cfb7ffdeSJay Foad } 189*cfb7ffdeSJay Foad 190*cfb7ffdeSJay Foad #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dump__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo191*cfb7ffdeSJay Foad void dump() const { 192*cfb7ffdeSJay Foad if (VALUCycles) 193*cfb7ffdeSJay Foad dbgs() << " VALUCycles=" << (int)VALUCycles; 194*cfb7ffdeSJay Foad if (VALUNum < VALU_MAX) 195*cfb7ffdeSJay Foad dbgs() << " VALUNum=" << (int)VALUNum; 196*cfb7ffdeSJay Foad if (TRANSCycles) 197*cfb7ffdeSJay Foad dbgs() << " TRANSCycles=" << (int)TRANSCycles; 198*cfb7ffdeSJay Foad if (TRANSNum < TRANS_MAX) 199*cfb7ffdeSJay Foad dbgs() << " TRANSNum=" << (int)TRANSNum; 200*cfb7ffdeSJay Foad if (TRANSNumVALU < VALU_MAX) 201*cfb7ffdeSJay Foad dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU; 202*cfb7ffdeSJay Foad if (SALUCycles) 203*cfb7ffdeSJay Foad dbgs() << " SALUCycles=" << (int)SALUCycles; 204*cfb7ffdeSJay Foad } 205*cfb7ffdeSJay Foad #endif 206*cfb7ffdeSJay Foad }; 207*cfb7ffdeSJay Foad 208*cfb7ffdeSJay Foad // A map from regunits to the delay info for that regunit. 209*cfb7ffdeSJay Foad struct DelayState : DenseMap<unsigned, DelayInfo> { 210*cfb7ffdeSJay Foad // Merge another DelayState into this one by merging the delay info for each 211*cfb7ffdeSJay Foad // regunit. merge__anond01463870111::AMDGPUInsertDelayAlu::DelayState212*cfb7ffdeSJay Foad void merge(const DelayState &RHS) { 213*cfb7ffdeSJay Foad for (const auto &KV : RHS) { 214*cfb7ffdeSJay Foad iterator It; 215*cfb7ffdeSJay Foad bool Inserted; 216*cfb7ffdeSJay Foad std::tie(It, Inserted) = insert(KV); 217*cfb7ffdeSJay Foad if (!Inserted) 218*cfb7ffdeSJay Foad It->second.merge(KV.second); 219*cfb7ffdeSJay Foad } 220*cfb7ffdeSJay Foad } 221*cfb7ffdeSJay Foad 222*cfb7ffdeSJay Foad // Advance the delay info for each regunit, erasing any that are no longer 223*cfb7ffdeSJay Foad // useful. advance__anond01463870111::AMDGPUInsertDelayAlu::DelayState224*cfb7ffdeSJay Foad void advance(DelayType Type, unsigned Cycles) { 225*cfb7ffdeSJay Foad iterator Next; 226*cfb7ffdeSJay Foad for (auto I = begin(), E = end(); I != E; I = Next) { 227*cfb7ffdeSJay Foad Next = std::next(I); 228*cfb7ffdeSJay Foad if (I->second.advance(Type, Cycles)) 229*cfb7ffdeSJay Foad erase(I); 230*cfb7ffdeSJay Foad } 231*cfb7ffdeSJay Foad } 232*cfb7ffdeSJay Foad 233*cfb7ffdeSJay Foad #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) dump__anond01463870111::AMDGPUInsertDelayAlu::DelayState234*cfb7ffdeSJay Foad void dump(const TargetRegisterInfo *TRI) const { 235*cfb7ffdeSJay Foad if (empty()) { 236*cfb7ffdeSJay Foad dbgs() << " empty\n"; 237*cfb7ffdeSJay Foad return; 238*cfb7ffdeSJay Foad } 239*cfb7ffdeSJay Foad 240*cfb7ffdeSJay Foad // Dump DelayInfo for each RegUnit in numerical order. 241*cfb7ffdeSJay Foad SmallVector<const_iterator, 8> Order; 242*cfb7ffdeSJay Foad Order.reserve(size()); 243*cfb7ffdeSJay Foad for (const_iterator I = begin(), E = end(); I != E; ++I) 244*cfb7ffdeSJay Foad Order.push_back(I); 245*cfb7ffdeSJay Foad llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) { 246*cfb7ffdeSJay Foad return A->first < B->first; 247*cfb7ffdeSJay Foad }); 248*cfb7ffdeSJay Foad for (const_iterator I : Order) { 249*cfb7ffdeSJay Foad dbgs() << " " << printRegUnit(I->first, TRI); 250*cfb7ffdeSJay Foad I->second.dump(); 251*cfb7ffdeSJay Foad dbgs() << "\n"; 252*cfb7ffdeSJay Foad } 253*cfb7ffdeSJay Foad } 254*cfb7ffdeSJay Foad #endif 255*cfb7ffdeSJay Foad }; 256*cfb7ffdeSJay Foad 257*cfb7ffdeSJay Foad // The saved delay state at the end of each basic block. 258*cfb7ffdeSJay Foad DenseMap<MachineBasicBlock *, DelayState> BlockState; 259*cfb7ffdeSJay Foad 260*cfb7ffdeSJay Foad // Emit an s_delay_alu instruction if necessary before MI. emitDelayAlu(MachineInstr & MI,DelayInfo Delay,MachineInstr * LastDelayAlu)261*cfb7ffdeSJay Foad MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay, 262*cfb7ffdeSJay Foad MachineInstr *LastDelayAlu) { 263*cfb7ffdeSJay Foad unsigned Imm = 0; 264*cfb7ffdeSJay Foad 265*cfb7ffdeSJay Foad // Wait for a TRANS instruction. 266*cfb7ffdeSJay Foad if (Delay.TRANSNum < DelayInfo::TRANS_MAX) 267*cfb7ffdeSJay Foad Imm |= 4 + Delay.TRANSNum; 268*cfb7ffdeSJay Foad 269*cfb7ffdeSJay Foad // Wait for a VALU instruction (if it's more recent than any TRANS 270*cfb7ffdeSJay Foad // instruction that we're also waiting for). 271*cfb7ffdeSJay Foad if (Delay.VALUNum < DelayInfo::VALU_MAX && 272*cfb7ffdeSJay Foad Delay.VALUNum <= Delay.TRANSNumVALU) { 273*cfb7ffdeSJay Foad if (Imm & 0xf) 274*cfb7ffdeSJay Foad Imm |= Delay.VALUNum << 7; 275*cfb7ffdeSJay Foad else 276*cfb7ffdeSJay Foad Imm |= Delay.VALUNum; 277*cfb7ffdeSJay Foad } 278*cfb7ffdeSJay Foad 279*cfb7ffdeSJay Foad // Wait for an SALU instruction. 280*cfb7ffdeSJay Foad if (Delay.SALUCycles) { 281*cfb7ffdeSJay Foad if (Imm & 0x780) { 282*cfb7ffdeSJay Foad // We have already encoded a VALU and a TRANS delay. There's no room in 283*cfb7ffdeSJay Foad // the encoding for an SALU delay as well, so just drop it. 284*cfb7ffdeSJay Foad } else if (Imm & 0xf) { 285*cfb7ffdeSJay Foad Imm |= (Delay.SALUCycles + 8) << 7; 286*cfb7ffdeSJay Foad } else { 287*cfb7ffdeSJay Foad Imm |= Delay.SALUCycles + 8; 288*cfb7ffdeSJay Foad } 289*cfb7ffdeSJay Foad } 290*cfb7ffdeSJay Foad 291*cfb7ffdeSJay Foad // Don't emit the s_delay_alu instruction if there's nothing to wait for. 292*cfb7ffdeSJay Foad if (!Imm) 293*cfb7ffdeSJay Foad return LastDelayAlu; 294*cfb7ffdeSJay Foad 295*cfb7ffdeSJay Foad // If we only need to wait for one instruction, try encoding it in the last 296*cfb7ffdeSJay Foad // s_delay_alu that we emitted. 297*cfb7ffdeSJay Foad if (!(Imm & 0x780) && LastDelayAlu) { 298*cfb7ffdeSJay Foad unsigned Skip = 0; 299*cfb7ffdeSJay Foad for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu), 300*cfb7ffdeSJay Foad E = MachineBasicBlock::instr_iterator(MI); 301*cfb7ffdeSJay Foad ++I != E;) { 302*cfb7ffdeSJay Foad if (!I->isBundle() && !I->isMetaInstruction()) 303*cfb7ffdeSJay Foad ++Skip; 304*cfb7ffdeSJay Foad } 305*cfb7ffdeSJay Foad if (Skip < 6) { 306*cfb7ffdeSJay Foad MachineOperand &Op = LastDelayAlu->getOperand(0); 307*cfb7ffdeSJay Foad unsigned LastImm = Op.getImm(); 308*cfb7ffdeSJay Foad assert((LastImm & ~0xf) == 0 && 309*cfb7ffdeSJay Foad "Remembered an s_delay_alu with no room for another delay!"); 310*cfb7ffdeSJay Foad LastImm |= Imm << 7 | Skip << 4; 311*cfb7ffdeSJay Foad Op.setImm(LastImm); 312*cfb7ffdeSJay Foad return nullptr; 313*cfb7ffdeSJay Foad } 314*cfb7ffdeSJay Foad } 315*cfb7ffdeSJay Foad 316*cfb7ffdeSJay Foad auto &MBB = *MI.getParent(); 317*cfb7ffdeSJay Foad MachineInstr *DelayAlu = 318*cfb7ffdeSJay Foad BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm); 319*cfb7ffdeSJay Foad // Remember the s_delay_alu for next time if there is still room in it to 320*cfb7ffdeSJay Foad // encode another delay. 321*cfb7ffdeSJay Foad return (Imm & 0x780) ? nullptr : DelayAlu; 322*cfb7ffdeSJay Foad } 323*cfb7ffdeSJay Foad runOnMachineBasicBlock(MachineBasicBlock & MBB,bool Emit)324*cfb7ffdeSJay Foad bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) { 325*cfb7ffdeSJay Foad DelayState State; 326*cfb7ffdeSJay Foad for (auto *Pred : MBB.predecessors()) 327*cfb7ffdeSJay Foad State.merge(BlockState[Pred]); 328*cfb7ffdeSJay Foad 329*cfb7ffdeSJay Foad LLVM_DEBUG(dbgs() << " State at start of " << printMBBReference(MBB) 330*cfb7ffdeSJay Foad << "\n"; 331*cfb7ffdeSJay Foad State.dump(TRI);); 332*cfb7ffdeSJay Foad 333*cfb7ffdeSJay Foad bool Changed = false; 334*cfb7ffdeSJay Foad MachineInstr *LastDelayAlu = nullptr; 335*cfb7ffdeSJay Foad 336*cfb7ffdeSJay Foad // Iterate over the contents of bundles, but don't emit any instructions 337*cfb7ffdeSJay Foad // inside a bundle. 338*cfb7ffdeSJay Foad for (auto &MI : MBB.instrs()) { 339*cfb7ffdeSJay Foad if (MI.isBundle() || MI.isMetaInstruction()) 340*cfb7ffdeSJay Foad continue; 341*cfb7ffdeSJay Foad 342*cfb7ffdeSJay Foad // Ignore some more instructions that do not generate any code. 343*cfb7ffdeSJay Foad switch (MI.getOpcode()) { 344*cfb7ffdeSJay Foad case AMDGPU::SI_RETURN_TO_EPILOG: 345*cfb7ffdeSJay Foad continue; 346*cfb7ffdeSJay Foad } 347*cfb7ffdeSJay Foad 348*cfb7ffdeSJay Foad DelayType Type = getDelayType(MI.getDesc().TSFlags); 349*cfb7ffdeSJay Foad 350*cfb7ffdeSJay Foad if (instructionWaitsForVALU(MI)) { 351*cfb7ffdeSJay Foad // Forget about all outstanding VALU delays. 352*cfb7ffdeSJay Foad State = DelayState(); 353*cfb7ffdeSJay Foad } else if (Type != OTHER) { 354*cfb7ffdeSJay Foad DelayInfo Delay; 355*cfb7ffdeSJay Foad // TODO: Scan implicit uses too? 356*cfb7ffdeSJay Foad for (const auto &Op : MI.explicit_uses()) { 357*cfb7ffdeSJay Foad if (Op.isReg()) { 358*cfb7ffdeSJay Foad // One of the operands of the writelane is also the output operand. 359*cfb7ffdeSJay Foad // This creates the insertion of redundant delays. Hence, we have to 360*cfb7ffdeSJay Foad // ignore this operand. 361*cfb7ffdeSJay Foad if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied()) 362*cfb7ffdeSJay Foad continue; 363*cfb7ffdeSJay Foad for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) { 364*cfb7ffdeSJay Foad auto It = State.find(*UI); 365*cfb7ffdeSJay Foad if (It != State.end()) { 366*cfb7ffdeSJay Foad Delay.merge(It->second); 367*cfb7ffdeSJay Foad State.erase(*UI); 368*cfb7ffdeSJay Foad } 369*cfb7ffdeSJay Foad } 370*cfb7ffdeSJay Foad } 371*cfb7ffdeSJay Foad } 372*cfb7ffdeSJay Foad if (Emit && !MI.isBundledWithPred()) { 373*cfb7ffdeSJay Foad // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or 374*cfb7ffdeSJay Foad // just ignore them? 375*cfb7ffdeSJay Foad LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu); 376*cfb7ffdeSJay Foad } 377*cfb7ffdeSJay Foad } 378*cfb7ffdeSJay Foad 379*cfb7ffdeSJay Foad if (Type != OTHER) { 380*cfb7ffdeSJay Foad // TODO: Scan implicit defs too? 381*cfb7ffdeSJay Foad for (const auto &Op : MI.defs()) { 382*cfb7ffdeSJay Foad unsigned Latency = SchedModel.computeOperandLatency( 383*cfb7ffdeSJay Foad &MI, MI.getOperandNo(&Op), nullptr, 0); 384*cfb7ffdeSJay Foad for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) 385*cfb7ffdeSJay Foad State[*UI] = DelayInfo(Type, Latency); 386*cfb7ffdeSJay Foad } 387*cfb7ffdeSJay Foad } 388*cfb7ffdeSJay Foad 389*cfb7ffdeSJay Foad // Advance by the number of cycles it takes to issue this instruction. 390*cfb7ffdeSJay Foad // TODO: Use a more advanced model that accounts for instructions that 391*cfb7ffdeSJay Foad // take multiple cycles to issue on a particular pipeline. 392*cfb7ffdeSJay Foad unsigned Cycles = SIInstrInfo::getNumWaitStates(MI); 393*cfb7ffdeSJay Foad // TODO: In wave64 mode, double the number of cycles for VALU and VMEM 394*cfb7ffdeSJay Foad // instructions on the assumption that they will usually have to be issued 395*cfb7ffdeSJay Foad // twice? 396*cfb7ffdeSJay Foad State.advance(Type, Cycles); 397*cfb7ffdeSJay Foad 398*cfb7ffdeSJay Foad LLVM_DEBUG(dbgs() << " State after " << MI; State.dump(TRI);); 399*cfb7ffdeSJay Foad } 400*cfb7ffdeSJay Foad 401*cfb7ffdeSJay Foad if (Emit) { 402*cfb7ffdeSJay Foad assert(State == BlockState[&MBB] && 403*cfb7ffdeSJay Foad "Basic block state should not have changed on final pass!"); 404*cfb7ffdeSJay Foad } else if (State != BlockState[&MBB]) { 405*cfb7ffdeSJay Foad BlockState[&MBB] = std::move(State); 406*cfb7ffdeSJay Foad Changed = true; 407*cfb7ffdeSJay Foad } 408*cfb7ffdeSJay Foad return Changed; 409*cfb7ffdeSJay Foad } 410*cfb7ffdeSJay Foad runOnMachineFunction(MachineFunction & MF)411*cfb7ffdeSJay Foad bool runOnMachineFunction(MachineFunction &MF) override { 412*cfb7ffdeSJay Foad if (skipFunction(MF.getFunction())) 413*cfb7ffdeSJay Foad return false; 414*cfb7ffdeSJay Foad 415*cfb7ffdeSJay Foad LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() 416*cfb7ffdeSJay Foad << "\n"); 417*cfb7ffdeSJay Foad 418*cfb7ffdeSJay Foad const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 419*cfb7ffdeSJay Foad if (!ST.hasDelayAlu()) 420*cfb7ffdeSJay Foad return false; 421*cfb7ffdeSJay Foad 422*cfb7ffdeSJay Foad SII = ST.getInstrInfo(); 423*cfb7ffdeSJay Foad TRI = ST.getRegisterInfo(); 424*cfb7ffdeSJay Foad 425*cfb7ffdeSJay Foad SchedModel.init(&ST); 426*cfb7ffdeSJay Foad 427*cfb7ffdeSJay Foad // Calculate the delay state for each basic block, iterating until we reach 428*cfb7ffdeSJay Foad // a fixed point. 429*cfb7ffdeSJay Foad SetVector<MachineBasicBlock *> WorkList; 430*cfb7ffdeSJay Foad for (auto &MBB : reverse(MF)) 431*cfb7ffdeSJay Foad WorkList.insert(&MBB); 432*cfb7ffdeSJay Foad while (!WorkList.empty()) { 433*cfb7ffdeSJay Foad auto &MBB = *WorkList.pop_back_val(); 434*cfb7ffdeSJay Foad bool Changed = runOnMachineBasicBlock(MBB, false); 435*cfb7ffdeSJay Foad if (Changed) 436*cfb7ffdeSJay Foad WorkList.insert(MBB.succ_begin(), MBB.succ_end()); 437*cfb7ffdeSJay Foad } 438*cfb7ffdeSJay Foad 439*cfb7ffdeSJay Foad LLVM_DEBUG(dbgs() << "Final pass over all BBs\n"); 440*cfb7ffdeSJay Foad 441*cfb7ffdeSJay Foad // Make one last pass over all basic blocks to emit s_delay_alu 442*cfb7ffdeSJay Foad // instructions. 443*cfb7ffdeSJay Foad bool Changed = false; 444*cfb7ffdeSJay Foad for (auto &MBB : MF) 445*cfb7ffdeSJay Foad Changed |= runOnMachineBasicBlock(MBB, true); 446*cfb7ffdeSJay Foad return Changed; 447*cfb7ffdeSJay Foad } 448*cfb7ffdeSJay Foad }; 449*cfb7ffdeSJay Foad 450*cfb7ffdeSJay Foad } // namespace 451*cfb7ffdeSJay Foad 452*cfb7ffdeSJay Foad char AMDGPUInsertDelayAlu::ID = 0; 453*cfb7ffdeSJay Foad 454*cfb7ffdeSJay Foad char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID; 455*cfb7ffdeSJay Foad 456*cfb7ffdeSJay Foad INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU", 457*cfb7ffdeSJay Foad false, false) 458