1*cfb7ffdeSJay Foad //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2*cfb7ffdeSJay Foad //
3*cfb7ffdeSJay Foad // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*cfb7ffdeSJay Foad // See https://llvm.org/LICENSE.txt for license information.
5*cfb7ffdeSJay Foad // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*cfb7ffdeSJay Foad //
7*cfb7ffdeSJay Foad //===----------------------------------------------------------------------===//
8*cfb7ffdeSJay Foad //
9*cfb7ffdeSJay Foad /// \file
10*cfb7ffdeSJay Foad /// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11*cfb7ffdeSJay Foad //
12*cfb7ffdeSJay Foad //===----------------------------------------------------------------------===//
13*cfb7ffdeSJay Foad 
14*cfb7ffdeSJay Foad #include "AMDGPU.h"
15*cfb7ffdeSJay Foad #include "GCNSubtarget.h"
16*cfb7ffdeSJay Foad #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17*cfb7ffdeSJay Foad #include "SIInstrInfo.h"
18*cfb7ffdeSJay Foad #include "llvm/ADT/SetVector.h"
19*cfb7ffdeSJay Foad 
20*cfb7ffdeSJay Foad using namespace llvm;
21*cfb7ffdeSJay Foad 
22*cfb7ffdeSJay Foad #define DEBUG_TYPE "amdgpu-insert-delay-alu"
23*cfb7ffdeSJay Foad 
24*cfb7ffdeSJay Foad namespace {
25*cfb7ffdeSJay Foad 
26*cfb7ffdeSJay Foad class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27*cfb7ffdeSJay Foad public:
28*cfb7ffdeSJay Foad   static char ID;
29*cfb7ffdeSJay Foad 
30*cfb7ffdeSJay Foad   const SIInstrInfo *SII;
31*cfb7ffdeSJay Foad   const TargetRegisterInfo *TRI;
32*cfb7ffdeSJay Foad 
33*cfb7ffdeSJay Foad   TargetSchedModel SchedModel;
34*cfb7ffdeSJay Foad 
AMDGPUInsertDelayAlu()35*cfb7ffdeSJay Foad   AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
36*cfb7ffdeSJay Foad 
getAnalysisUsage(AnalysisUsage & AU) const37*cfb7ffdeSJay Foad   void getAnalysisUsage(AnalysisUsage &AU) const override {
38*cfb7ffdeSJay Foad     AU.setPreservesCFG();
39*cfb7ffdeSJay Foad     MachineFunctionPass::getAnalysisUsage(AU);
40*cfb7ffdeSJay Foad   }
41*cfb7ffdeSJay Foad 
42*cfb7ffdeSJay Foad   // Return true if MI waits for all outstanding VALU instructions to complete.
instructionWaitsForVALU(const MachineInstr & MI)43*cfb7ffdeSJay Foad   static bool instructionWaitsForVALU(const MachineInstr &MI) {
44*cfb7ffdeSJay Foad     // These instruction types wait for VA_VDST==0 before issuing.
45*cfb7ffdeSJay Foad     const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
46*cfb7ffdeSJay Foad                                SIInstrFlags::FLAT | SIInstrFlags::MIMG |
47*cfb7ffdeSJay Foad                                SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
48*cfb7ffdeSJay Foad     if (MI.getDesc().TSFlags & VA_VDST_0)
49*cfb7ffdeSJay Foad       return true;
50*cfb7ffdeSJay Foad     if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51*cfb7ffdeSJay Foad         MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52*cfb7ffdeSJay Foad       return true;
53*cfb7ffdeSJay Foad     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54*cfb7ffdeSJay Foad         (MI.getOperand(0).getImm() & 0xf000) == 0)
55*cfb7ffdeSJay Foad       return true;
56*cfb7ffdeSJay Foad     return false;
57*cfb7ffdeSJay Foad   }
58*cfb7ffdeSJay Foad 
// Types of delay that can be encoded in an s_delay_alu instruction, plus a
// catch-all for instructions that produce none of them.
enum DelayType {
  VALU,  // Written by a VALU instruction that is not also TRANS.
  TRANS, // Written by a TRANS instruction.
  SALU,  // Written by an SALU instruction.
  OTHER  // Anything else; such instructions are not tracked as producers.
};
61*cfb7ffdeSJay Foad 
62*cfb7ffdeSJay Foad   // Get the delay type for an instruction with the specified TSFlags.
getDelayType(uint64_t TSFlags)63*cfb7ffdeSJay Foad   static DelayType getDelayType(uint64_t TSFlags) {
64*cfb7ffdeSJay Foad     if (TSFlags & SIInstrFlags::TRANS)
65*cfb7ffdeSJay Foad       return TRANS;
66*cfb7ffdeSJay Foad     if (TSFlags & SIInstrFlags::VALU)
67*cfb7ffdeSJay Foad       return VALU;
68*cfb7ffdeSJay Foad     if (TSFlags & SIInstrFlags::SALU)
69*cfb7ffdeSJay Foad       return SALU;
70*cfb7ffdeSJay Foad     return OTHER;
71*cfb7ffdeSJay Foad   }
72*cfb7ffdeSJay Foad 
73*cfb7ffdeSJay Foad   // Information about the last instruction(s) that wrote to a particular
74*cfb7ffdeSJay Foad   // regunit. In straight-line code there will only be one such instruction, but
75*cfb7ffdeSJay Foad   // when control flow converges we merge the delay information from each path
76*cfb7ffdeSJay Foad   // to represent the union of the worst-case delays of each type.
77*cfb7ffdeSJay Foad   struct DelayInfo {
78*cfb7ffdeSJay Foad     // One larger than the maximum number of (non-TRANS) VALU instructions we
79*cfb7ffdeSJay Foad     // can encode in an s_delay_alu instruction.
80*cfb7ffdeSJay Foad     static const unsigned VALU_MAX = 5;
81*cfb7ffdeSJay Foad 
82*cfb7ffdeSJay Foad     // One larger than the maximum number of TRANS instructions we can encode in
83*cfb7ffdeSJay Foad     // an s_delay_alu instruction.
84*cfb7ffdeSJay Foad     static const unsigned TRANS_MAX = 4;
85*cfb7ffdeSJay Foad 
86*cfb7ffdeSJay Foad     // If it was written by a (non-TRANS) VALU, remember how many clock cycles
87*cfb7ffdeSJay Foad     // are left until it completes, and how many other (non-TRANS) VALU we have
88*cfb7ffdeSJay Foad     // seen since it was issued.
89*cfb7ffdeSJay Foad     uint8_t VALUCycles = 0;
90*cfb7ffdeSJay Foad     uint8_t VALUNum = VALU_MAX;
91*cfb7ffdeSJay Foad 
92*cfb7ffdeSJay Foad     // If it was written by a TRANS, remember how many clock cycles are left
93*cfb7ffdeSJay Foad     // until it completes, and how many other TRANS we have seen since it was
94*cfb7ffdeSJay Foad     // issued.
95*cfb7ffdeSJay Foad     uint8_t TRANSCycles = 0;
96*cfb7ffdeSJay Foad     uint8_t TRANSNum = TRANS_MAX;
97*cfb7ffdeSJay Foad     // Also remember how many other (non-TRANS) VALU we have seen since it was
98*cfb7ffdeSJay Foad     // issued. When an instruction depends on both a prior TRANS and a prior
99*cfb7ffdeSJay Foad     // non-TRANS VALU, this is used to decide whether to encode a wait for just
100*cfb7ffdeSJay Foad     // one or both of them.
101*cfb7ffdeSJay Foad     uint8_t TRANSNumVALU = VALU_MAX;
102*cfb7ffdeSJay Foad 
103*cfb7ffdeSJay Foad     // If it was written by an SALU, remember how many clock cycles are left
104*cfb7ffdeSJay Foad     // until it completes.
105*cfb7ffdeSJay Foad     uint8_t SALUCycles = 0;
106*cfb7ffdeSJay Foad 
107*cfb7ffdeSJay Foad     DelayInfo() = default;
108*cfb7ffdeSJay Foad 
DelayInfo__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo109*cfb7ffdeSJay Foad     DelayInfo(DelayType Type, unsigned Cycles) {
110*cfb7ffdeSJay Foad       switch (Type) {
111*cfb7ffdeSJay Foad       default:
112*cfb7ffdeSJay Foad         llvm_unreachable("unexpected type");
113*cfb7ffdeSJay Foad       case VALU:
114*cfb7ffdeSJay Foad         VALUCycles = Cycles;
115*cfb7ffdeSJay Foad         VALUNum = 0;
116*cfb7ffdeSJay Foad         break;
117*cfb7ffdeSJay Foad       case TRANS:
118*cfb7ffdeSJay Foad         TRANSCycles = Cycles;
119*cfb7ffdeSJay Foad         TRANSNum = 0;
120*cfb7ffdeSJay Foad         TRANSNumVALU = 0;
121*cfb7ffdeSJay Foad         break;
122*cfb7ffdeSJay Foad       case SALU:
123*cfb7ffdeSJay Foad         SALUCycles = Cycles;
124*cfb7ffdeSJay Foad         break;
125*cfb7ffdeSJay Foad       }
126*cfb7ffdeSJay Foad     }
127*cfb7ffdeSJay Foad 
operator ==__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo128*cfb7ffdeSJay Foad     bool operator==(const DelayInfo &RHS) const {
129*cfb7ffdeSJay Foad       return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
130*cfb7ffdeSJay Foad              TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
131*cfb7ffdeSJay Foad              TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
132*cfb7ffdeSJay Foad     }
133*cfb7ffdeSJay Foad 
operator !=__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo134*cfb7ffdeSJay Foad     bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
135*cfb7ffdeSJay Foad 
136*cfb7ffdeSJay Foad     // Merge another DelayInfo into this one, to represent the union of the
137*cfb7ffdeSJay Foad     // worst-case delays of each type.
merge__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo138*cfb7ffdeSJay Foad     void merge(const DelayInfo &RHS) {
139*cfb7ffdeSJay Foad       VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
140*cfb7ffdeSJay Foad       VALUNum = std::min(VALUNum, RHS.VALUNum);
141*cfb7ffdeSJay Foad       TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
142*cfb7ffdeSJay Foad       TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
143*cfb7ffdeSJay Foad       TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
144*cfb7ffdeSJay Foad       SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
145*cfb7ffdeSJay Foad     }
146*cfb7ffdeSJay Foad 
147*cfb7ffdeSJay Foad     // Update this DelayInfo after issuing an instruction. IsVALU should be 1
148*cfb7ffdeSJay Foad     // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
149*cfb7ffdeSJay Foad     // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
150*cfb7ffdeSJay Foad     // instruction.  Return true if there is no longer any useful delay info.
advance__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo151*cfb7ffdeSJay Foad     bool advance(DelayType Type, unsigned Cycles) {
152*cfb7ffdeSJay Foad       bool Erase = true;
153*cfb7ffdeSJay Foad 
154*cfb7ffdeSJay Foad       VALUNum += (Type == VALU);
155*cfb7ffdeSJay Foad       if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
156*cfb7ffdeSJay Foad         // Forget about the VALU instruction. It was too far back or has
157*cfb7ffdeSJay Foad         // definitely completed by now.
158*cfb7ffdeSJay Foad         VALUNum = VALU_MAX;
159*cfb7ffdeSJay Foad         VALUCycles = 0;
160*cfb7ffdeSJay Foad       } else {
161*cfb7ffdeSJay Foad         VALUCycles -= Cycles;
162*cfb7ffdeSJay Foad         Erase = false;
163*cfb7ffdeSJay Foad       }
164*cfb7ffdeSJay Foad 
165*cfb7ffdeSJay Foad       TRANSNum += (Type == TRANS);
166*cfb7ffdeSJay Foad       TRANSNumVALU += (Type == VALU);
167*cfb7ffdeSJay Foad       if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
168*cfb7ffdeSJay Foad         // Forget about any TRANS instruction. It was too far back or has
169*cfb7ffdeSJay Foad         // definitely completed by now.
170*cfb7ffdeSJay Foad         TRANSNum = TRANS_MAX;
171*cfb7ffdeSJay Foad         TRANSNumVALU = VALU_MAX;
172*cfb7ffdeSJay Foad         TRANSCycles = 0;
173*cfb7ffdeSJay Foad       } else {
174*cfb7ffdeSJay Foad         TRANSCycles -= Cycles;
175*cfb7ffdeSJay Foad         Erase = false;
176*cfb7ffdeSJay Foad       }
177*cfb7ffdeSJay Foad 
178*cfb7ffdeSJay Foad       if (SALUCycles <= Cycles) {
179*cfb7ffdeSJay Foad         // Forget about any SALU instruction. It has definitely completed by
180*cfb7ffdeSJay Foad         // now.
181*cfb7ffdeSJay Foad         SALUCycles = 0;
182*cfb7ffdeSJay Foad       } else {
183*cfb7ffdeSJay Foad         SALUCycles -= Cycles;
184*cfb7ffdeSJay Foad         Erase = false;
185*cfb7ffdeSJay Foad       }
186*cfb7ffdeSJay Foad 
187*cfb7ffdeSJay Foad       return Erase;
188*cfb7ffdeSJay Foad     }
189*cfb7ffdeSJay Foad 
190*cfb7ffdeSJay Foad #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump__anond01463870111::AMDGPUInsertDelayAlu::DelayInfo191*cfb7ffdeSJay Foad     void dump() const {
192*cfb7ffdeSJay Foad       if (VALUCycles)
193*cfb7ffdeSJay Foad         dbgs() << " VALUCycles=" << (int)VALUCycles;
194*cfb7ffdeSJay Foad       if (VALUNum < VALU_MAX)
195*cfb7ffdeSJay Foad         dbgs() << " VALUNum=" << (int)VALUNum;
196*cfb7ffdeSJay Foad       if (TRANSCycles)
197*cfb7ffdeSJay Foad         dbgs() << " TRANSCycles=" << (int)TRANSCycles;
198*cfb7ffdeSJay Foad       if (TRANSNum < TRANS_MAX)
199*cfb7ffdeSJay Foad         dbgs() << " TRANSNum=" << (int)TRANSNum;
200*cfb7ffdeSJay Foad       if (TRANSNumVALU < VALU_MAX)
201*cfb7ffdeSJay Foad         dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
202*cfb7ffdeSJay Foad       if (SALUCycles)
203*cfb7ffdeSJay Foad         dbgs() << " SALUCycles=" << (int)SALUCycles;
204*cfb7ffdeSJay Foad     }
205*cfb7ffdeSJay Foad #endif
206*cfb7ffdeSJay Foad   };
207*cfb7ffdeSJay Foad 
208*cfb7ffdeSJay Foad   // A map from regunits to the delay info for that regunit.
209*cfb7ffdeSJay Foad   struct DelayState : DenseMap<unsigned, DelayInfo> {
210*cfb7ffdeSJay Foad     // Merge another DelayState into this one by merging the delay info for each
211*cfb7ffdeSJay Foad     // regunit.
merge__anond01463870111::AMDGPUInsertDelayAlu::DelayState212*cfb7ffdeSJay Foad     void merge(const DelayState &RHS) {
213*cfb7ffdeSJay Foad       for (const auto &KV : RHS) {
214*cfb7ffdeSJay Foad         iterator It;
215*cfb7ffdeSJay Foad         bool Inserted;
216*cfb7ffdeSJay Foad         std::tie(It, Inserted) = insert(KV);
217*cfb7ffdeSJay Foad         if (!Inserted)
218*cfb7ffdeSJay Foad           It->second.merge(KV.second);
219*cfb7ffdeSJay Foad       }
220*cfb7ffdeSJay Foad     }
221*cfb7ffdeSJay Foad 
222*cfb7ffdeSJay Foad     // Advance the delay info for each regunit, erasing any that are no longer
223*cfb7ffdeSJay Foad     // useful.
advance__anond01463870111::AMDGPUInsertDelayAlu::DelayState224*cfb7ffdeSJay Foad     void advance(DelayType Type, unsigned Cycles) {
225*cfb7ffdeSJay Foad       iterator Next;
226*cfb7ffdeSJay Foad       for (auto I = begin(), E = end(); I != E; I = Next) {
227*cfb7ffdeSJay Foad         Next = std::next(I);
228*cfb7ffdeSJay Foad         if (I->second.advance(Type, Cycles))
229*cfb7ffdeSJay Foad           erase(I);
230*cfb7ffdeSJay Foad       }
231*cfb7ffdeSJay Foad     }
232*cfb7ffdeSJay Foad 
233*cfb7ffdeSJay Foad #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump__anond01463870111::AMDGPUInsertDelayAlu::DelayState234*cfb7ffdeSJay Foad     void dump(const TargetRegisterInfo *TRI) const {
235*cfb7ffdeSJay Foad       if (empty()) {
236*cfb7ffdeSJay Foad         dbgs() << "    empty\n";
237*cfb7ffdeSJay Foad         return;
238*cfb7ffdeSJay Foad       }
239*cfb7ffdeSJay Foad 
240*cfb7ffdeSJay Foad       // Dump DelayInfo for each RegUnit in numerical order.
241*cfb7ffdeSJay Foad       SmallVector<const_iterator, 8> Order;
242*cfb7ffdeSJay Foad       Order.reserve(size());
243*cfb7ffdeSJay Foad       for (const_iterator I = begin(), E = end(); I != E; ++I)
244*cfb7ffdeSJay Foad         Order.push_back(I);
245*cfb7ffdeSJay Foad       llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
246*cfb7ffdeSJay Foad         return A->first < B->first;
247*cfb7ffdeSJay Foad       });
248*cfb7ffdeSJay Foad       for (const_iterator I : Order) {
249*cfb7ffdeSJay Foad         dbgs() << "    " << printRegUnit(I->first, TRI);
250*cfb7ffdeSJay Foad         I->second.dump();
251*cfb7ffdeSJay Foad         dbgs() << "\n";
252*cfb7ffdeSJay Foad       }
253*cfb7ffdeSJay Foad     }
254*cfb7ffdeSJay Foad #endif
255*cfb7ffdeSJay Foad   };
256*cfb7ffdeSJay Foad 
257*cfb7ffdeSJay Foad   // The saved delay state at the end of each basic block.
258*cfb7ffdeSJay Foad   DenseMap<MachineBasicBlock *, DelayState> BlockState;
259*cfb7ffdeSJay Foad 
260*cfb7ffdeSJay Foad   // Emit an s_delay_alu instruction if necessary before MI.
emitDelayAlu(MachineInstr & MI,DelayInfo Delay,MachineInstr * LastDelayAlu)261*cfb7ffdeSJay Foad   MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
262*cfb7ffdeSJay Foad                              MachineInstr *LastDelayAlu) {
263*cfb7ffdeSJay Foad     unsigned Imm = 0;
264*cfb7ffdeSJay Foad 
265*cfb7ffdeSJay Foad     // Wait for a TRANS instruction.
266*cfb7ffdeSJay Foad     if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
267*cfb7ffdeSJay Foad       Imm |= 4 + Delay.TRANSNum;
268*cfb7ffdeSJay Foad 
269*cfb7ffdeSJay Foad     // Wait for a VALU instruction (if it's more recent than any TRANS
270*cfb7ffdeSJay Foad     // instruction that we're also waiting for).
271*cfb7ffdeSJay Foad     if (Delay.VALUNum < DelayInfo::VALU_MAX &&
272*cfb7ffdeSJay Foad         Delay.VALUNum <= Delay.TRANSNumVALU) {
273*cfb7ffdeSJay Foad       if (Imm & 0xf)
274*cfb7ffdeSJay Foad         Imm |= Delay.VALUNum << 7;
275*cfb7ffdeSJay Foad       else
276*cfb7ffdeSJay Foad         Imm |= Delay.VALUNum;
277*cfb7ffdeSJay Foad     }
278*cfb7ffdeSJay Foad 
279*cfb7ffdeSJay Foad     // Wait for an SALU instruction.
280*cfb7ffdeSJay Foad     if (Delay.SALUCycles) {
281*cfb7ffdeSJay Foad       if (Imm & 0x780) {
282*cfb7ffdeSJay Foad         // We have already encoded a VALU and a TRANS delay. There's no room in
283*cfb7ffdeSJay Foad         // the encoding for an SALU delay as well, so just drop it.
284*cfb7ffdeSJay Foad       } else if (Imm & 0xf) {
285*cfb7ffdeSJay Foad         Imm |= (Delay.SALUCycles + 8) << 7;
286*cfb7ffdeSJay Foad       } else {
287*cfb7ffdeSJay Foad         Imm |= Delay.SALUCycles + 8;
288*cfb7ffdeSJay Foad       }
289*cfb7ffdeSJay Foad     }
290*cfb7ffdeSJay Foad 
291*cfb7ffdeSJay Foad     // Don't emit the s_delay_alu instruction if there's nothing to wait for.
292*cfb7ffdeSJay Foad     if (!Imm)
293*cfb7ffdeSJay Foad       return LastDelayAlu;
294*cfb7ffdeSJay Foad 
295*cfb7ffdeSJay Foad     // If we only need to wait for one instruction, try encoding it in the last
296*cfb7ffdeSJay Foad     // s_delay_alu that we emitted.
297*cfb7ffdeSJay Foad     if (!(Imm & 0x780) && LastDelayAlu) {
298*cfb7ffdeSJay Foad       unsigned Skip = 0;
299*cfb7ffdeSJay Foad       for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
300*cfb7ffdeSJay Foad                 E = MachineBasicBlock::instr_iterator(MI);
301*cfb7ffdeSJay Foad            ++I != E;) {
302*cfb7ffdeSJay Foad         if (!I->isBundle() && !I->isMetaInstruction())
303*cfb7ffdeSJay Foad           ++Skip;
304*cfb7ffdeSJay Foad       }
305*cfb7ffdeSJay Foad       if (Skip < 6) {
306*cfb7ffdeSJay Foad         MachineOperand &Op = LastDelayAlu->getOperand(0);
307*cfb7ffdeSJay Foad         unsigned LastImm = Op.getImm();
308*cfb7ffdeSJay Foad         assert((LastImm & ~0xf) == 0 &&
309*cfb7ffdeSJay Foad                "Remembered an s_delay_alu with no room for another delay!");
310*cfb7ffdeSJay Foad         LastImm |= Imm << 7 | Skip << 4;
311*cfb7ffdeSJay Foad         Op.setImm(LastImm);
312*cfb7ffdeSJay Foad         return nullptr;
313*cfb7ffdeSJay Foad       }
314*cfb7ffdeSJay Foad     }
315*cfb7ffdeSJay Foad 
316*cfb7ffdeSJay Foad     auto &MBB = *MI.getParent();
317*cfb7ffdeSJay Foad     MachineInstr *DelayAlu =
318*cfb7ffdeSJay Foad         BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
319*cfb7ffdeSJay Foad     // Remember the s_delay_alu for next time if there is still room in it to
320*cfb7ffdeSJay Foad     // encode another delay.
321*cfb7ffdeSJay Foad     return (Imm & 0x780) ? nullptr : DelayAlu;
322*cfb7ffdeSJay Foad   }
323*cfb7ffdeSJay Foad 
runOnMachineBasicBlock(MachineBasicBlock & MBB,bool Emit)324*cfb7ffdeSJay Foad   bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
325*cfb7ffdeSJay Foad     DelayState State;
326*cfb7ffdeSJay Foad     for (auto *Pred : MBB.predecessors())
327*cfb7ffdeSJay Foad       State.merge(BlockState[Pred]);
328*cfb7ffdeSJay Foad 
329*cfb7ffdeSJay Foad     LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
330*cfb7ffdeSJay Foad                       << "\n";
331*cfb7ffdeSJay Foad                State.dump(TRI););
332*cfb7ffdeSJay Foad 
333*cfb7ffdeSJay Foad     bool Changed = false;
334*cfb7ffdeSJay Foad     MachineInstr *LastDelayAlu = nullptr;
335*cfb7ffdeSJay Foad 
336*cfb7ffdeSJay Foad     // Iterate over the contents of bundles, but don't emit any instructions
337*cfb7ffdeSJay Foad     // inside a bundle.
338*cfb7ffdeSJay Foad     for (auto &MI : MBB.instrs()) {
339*cfb7ffdeSJay Foad       if (MI.isBundle() || MI.isMetaInstruction())
340*cfb7ffdeSJay Foad         continue;
341*cfb7ffdeSJay Foad 
342*cfb7ffdeSJay Foad       // Ignore some more instructions that do not generate any code.
343*cfb7ffdeSJay Foad       switch (MI.getOpcode()) {
344*cfb7ffdeSJay Foad       case AMDGPU::SI_RETURN_TO_EPILOG:
345*cfb7ffdeSJay Foad         continue;
346*cfb7ffdeSJay Foad       }
347*cfb7ffdeSJay Foad 
348*cfb7ffdeSJay Foad       DelayType Type = getDelayType(MI.getDesc().TSFlags);
349*cfb7ffdeSJay Foad 
350*cfb7ffdeSJay Foad       if (instructionWaitsForVALU(MI)) {
351*cfb7ffdeSJay Foad         // Forget about all outstanding VALU delays.
352*cfb7ffdeSJay Foad         State = DelayState();
353*cfb7ffdeSJay Foad       } else if (Type != OTHER) {
354*cfb7ffdeSJay Foad         DelayInfo Delay;
355*cfb7ffdeSJay Foad         // TODO: Scan implicit uses too?
356*cfb7ffdeSJay Foad         for (const auto &Op : MI.explicit_uses()) {
357*cfb7ffdeSJay Foad           if (Op.isReg()) {
358*cfb7ffdeSJay Foad             // One of the operands of the writelane is also the output operand.
359*cfb7ffdeSJay Foad             // This creates the insertion of redundant delays. Hence, we have to
360*cfb7ffdeSJay Foad             // ignore this operand.
361*cfb7ffdeSJay Foad             if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
362*cfb7ffdeSJay Foad               continue;
363*cfb7ffdeSJay Foad             for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
364*cfb7ffdeSJay Foad               auto It = State.find(*UI);
365*cfb7ffdeSJay Foad               if (It != State.end()) {
366*cfb7ffdeSJay Foad                 Delay.merge(It->second);
367*cfb7ffdeSJay Foad                 State.erase(*UI);
368*cfb7ffdeSJay Foad               }
369*cfb7ffdeSJay Foad             }
370*cfb7ffdeSJay Foad           }
371*cfb7ffdeSJay Foad         }
372*cfb7ffdeSJay Foad         if (Emit && !MI.isBundledWithPred()) {
373*cfb7ffdeSJay Foad           // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
374*cfb7ffdeSJay Foad           // just ignore them?
375*cfb7ffdeSJay Foad           LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
376*cfb7ffdeSJay Foad         }
377*cfb7ffdeSJay Foad       }
378*cfb7ffdeSJay Foad 
379*cfb7ffdeSJay Foad       if (Type != OTHER) {
380*cfb7ffdeSJay Foad         // TODO: Scan implicit defs too?
381*cfb7ffdeSJay Foad         for (const auto &Op : MI.defs()) {
382*cfb7ffdeSJay Foad           unsigned Latency = SchedModel.computeOperandLatency(
383*cfb7ffdeSJay Foad               &MI, MI.getOperandNo(&Op), nullptr, 0);
384*cfb7ffdeSJay Foad           for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
385*cfb7ffdeSJay Foad             State[*UI] = DelayInfo(Type, Latency);
386*cfb7ffdeSJay Foad         }
387*cfb7ffdeSJay Foad       }
388*cfb7ffdeSJay Foad 
389*cfb7ffdeSJay Foad       // Advance by the number of cycles it takes to issue this instruction.
390*cfb7ffdeSJay Foad       // TODO: Use a more advanced model that accounts for instructions that
391*cfb7ffdeSJay Foad       // take multiple cycles to issue on a particular pipeline.
392*cfb7ffdeSJay Foad       unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
393*cfb7ffdeSJay Foad       // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
394*cfb7ffdeSJay Foad       // instructions on the assumption that they will usually have to be issued
395*cfb7ffdeSJay Foad       // twice?
396*cfb7ffdeSJay Foad       State.advance(Type, Cycles);
397*cfb7ffdeSJay Foad 
398*cfb7ffdeSJay Foad       LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
399*cfb7ffdeSJay Foad     }
400*cfb7ffdeSJay Foad 
401*cfb7ffdeSJay Foad     if (Emit) {
402*cfb7ffdeSJay Foad       assert(State == BlockState[&MBB] &&
403*cfb7ffdeSJay Foad              "Basic block state should not have changed on final pass!");
404*cfb7ffdeSJay Foad     } else if (State != BlockState[&MBB]) {
405*cfb7ffdeSJay Foad       BlockState[&MBB] = std::move(State);
406*cfb7ffdeSJay Foad       Changed = true;
407*cfb7ffdeSJay Foad     }
408*cfb7ffdeSJay Foad     return Changed;
409*cfb7ffdeSJay Foad   }
410*cfb7ffdeSJay Foad 
runOnMachineFunction(MachineFunction & MF)411*cfb7ffdeSJay Foad   bool runOnMachineFunction(MachineFunction &MF) override {
412*cfb7ffdeSJay Foad     if (skipFunction(MF.getFunction()))
413*cfb7ffdeSJay Foad       return false;
414*cfb7ffdeSJay Foad 
415*cfb7ffdeSJay Foad     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
416*cfb7ffdeSJay Foad                       << "\n");
417*cfb7ffdeSJay Foad 
418*cfb7ffdeSJay Foad     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
419*cfb7ffdeSJay Foad     if (!ST.hasDelayAlu())
420*cfb7ffdeSJay Foad       return false;
421*cfb7ffdeSJay Foad 
422*cfb7ffdeSJay Foad     SII = ST.getInstrInfo();
423*cfb7ffdeSJay Foad     TRI = ST.getRegisterInfo();
424*cfb7ffdeSJay Foad 
425*cfb7ffdeSJay Foad     SchedModel.init(&ST);
426*cfb7ffdeSJay Foad 
427*cfb7ffdeSJay Foad     // Calculate the delay state for each basic block, iterating until we reach
428*cfb7ffdeSJay Foad     // a fixed point.
429*cfb7ffdeSJay Foad     SetVector<MachineBasicBlock *> WorkList;
430*cfb7ffdeSJay Foad     for (auto &MBB : reverse(MF))
431*cfb7ffdeSJay Foad       WorkList.insert(&MBB);
432*cfb7ffdeSJay Foad     while (!WorkList.empty()) {
433*cfb7ffdeSJay Foad       auto &MBB = *WorkList.pop_back_val();
434*cfb7ffdeSJay Foad       bool Changed = runOnMachineBasicBlock(MBB, false);
435*cfb7ffdeSJay Foad       if (Changed)
436*cfb7ffdeSJay Foad         WorkList.insert(MBB.succ_begin(), MBB.succ_end());
437*cfb7ffdeSJay Foad     }
438*cfb7ffdeSJay Foad 
439*cfb7ffdeSJay Foad     LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
440*cfb7ffdeSJay Foad 
441*cfb7ffdeSJay Foad     // Make one last pass over all basic blocks to emit s_delay_alu
442*cfb7ffdeSJay Foad     // instructions.
443*cfb7ffdeSJay Foad     bool Changed = false;
444*cfb7ffdeSJay Foad     for (auto &MBB : MF)
445*cfb7ffdeSJay Foad       Changed |= runOnMachineBasicBlock(MBB, true);
446*cfb7ffdeSJay Foad     return Changed;
447*cfb7ffdeSJay Foad   }
448*cfb7ffdeSJay Foad };
449*cfb7ffdeSJay Foad 
450*cfb7ffdeSJay Foad } // namespace
451*cfb7ffdeSJay Foad 
452*cfb7ffdeSJay Foad char AMDGPUInsertDelayAlu::ID = 0;
453*cfb7ffdeSJay Foad 
454*cfb7ffdeSJay Foad char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
455*cfb7ffdeSJay Foad 
456*cfb7ffdeSJay Foad INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
457*cfb7ffdeSJay Foad                 false, false)
458