1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM, whether an inactive lane is enabled may depend on control
/// flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). It ensures that WQM is
20 /// enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
// Bitmask of the execution states an instruction or block may require or
// provide. Values are combined with bitwise OR.
enum {
  StateWQM = 0x1,        // Whole quad mode (non-strict).
  StateStrictWWM = 0x2,  // Strict whole wavefront mode.
  StateStrictWQM = 0x4,  // Strict whole quad mode.
  StateExact = 0x8,      // Exact mode (only originally active lanes).
  StateStrict = StateStrictWWM | StateStrictWQM, // Either strict mode.
};
98 
/// Small helper that wraps a state bitmask so it can be printed symbolically
/// via the debug-only operator<< below (e.g. "WQM|Exact").
struct PrintState {
  int State; // Bitwise OR of the State* flags.

  explicit PrintState(int State) : State(State) {}
};
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::make_pair(StateWQM, "WQM"),
111       std::make_pair(StateStrictWWM, "StrictWWM"),
112       std::make_pair(StateStrictWQM, "StrictWQM"),
113       std::make_pair(StateExact, "Exact")};
114   char State = PS.State;
115   for (auto M : Mapping) {
116     if (State & M.first) {
117       OS << M.second;
118       State &= ~M.first;
119 
120       if (State)
121         OS << '|';
122     }
123   }
124   assert(State == 0);
125   return OS;
126 }
127 #endif
128 
/// Per-instruction analysis results (bitmasks of State* flags).
struct InstrInfo {
  char Needs = 0;    // States this instruction must execute in.
  char Disabled = 0; // States this instruction must never execute in.
  char OutNeeds = 0; // States required immediately after this instruction.
};
134 
/// Per-basic-block analysis results (bitmasks of State* flags).
struct BlockInfo {
  char Needs = 0;        // States needed somewhere inside the block.
  char InNeeds = 0;      // States needed at block entry.
  char OutNeeds = 0;     // States needed at block exit.
  char InitialState = 0; // State chosen for the start of the block.
  bool NeedsLowering = false; // Block contains kills/demotes to lower later.
};
142 
/// Worklist entry for the fixed-point propagation: exactly one of MBB or MI
/// is set, indicating whether a block or an instruction should be
/// (re)processed.
struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  // Implicit conversions are intentional: items are pushed directly from
  // MachineBasicBlock*/MachineInstr* values.
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};
151 
/// Machine function pass that computes which execution state
/// (WQM/StrictWWM/StrictWQM/Exact) each instruction requires and inserts the
/// EXEC-mask manipulation code to switch between those states.
class SIWholeQuadMode : public MachineFunctionPass {
private:
  // Cached target/function info, set up in runOnMachineFunction.
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  // Wave-size dependent opcodes and registers (wave32 vs wave64 variants).
  unsigned AndOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned OrSaveExecOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg; // Holds the lanes live on function entry.

  // Analysis results, keyed by instruction respectively block.
  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  // Instructions collected during scanning that are lowered in later phases.
  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;

  void printInfo();

  // Analysis phase: mark instructions/operands/defs with required states and
  // propagate the requirements to a fixed point.
  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  // Transformation phase: insert state transitions at suitable points.
  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  // Lowering of kill/demote pseudo-instructions.
  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  MachineFunctionProperties getClearedProperties() const override {
    // The pass introduces multiple defs of EXEC-dependent values, so the
    // function is no longer in SSA form afterwards.
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};
252 
253 } // end anonymous namespace
254 
char SIWholeQuadMode::ID = 0;

// Standard legacy pass-manager registration boilerplate, declaring the
// analyses this pass depends on.
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

/// Factory used by the AMDGPU target pass pipeline.
FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}
270 
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273   for (const auto &BII : Blocks) {
274     dbgs() << "\n"
275            << printMBBReference(*BII.first) << ":\n"
276            << "  InNeeds = " << PrintState(BII.second.InNeeds)
277            << ", Needs = " << PrintState(BII.second.Needs)
278            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279 
280     for (const MachineInstr &MI : *BII.first) {
281       auto III = Instructions.find(&MI);
282       if (III == Instructions.end())
283         continue;
284 
285       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
286              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
287     }
288   }
289 }
290 #endif
291 
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293                                       std::vector<WorkItem> &Worklist) {
294   InstrInfo &II = Instructions[&MI];
295 
296   assert(!(Flag & StateExact) && Flag != 0);
297 
298   // Remove any disabled states from the flag. The user that required it gets
299   // an undefined value in the helper lanes. For example, this can happen if
300   // the result of an atomic is used by instruction that requires WQM, where
301   // ignoring the request for WQM is correct as per the relevant specs.
302   Flag &= ~II.Disabled;
303 
304   // Ignore if the flag is already encompassed by the existing needs, or we
305   // just disabled everything.
306   if ((II.Needs & Flag) == Flag)
307     return;
308 
309   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310   II.Needs |= Flag;
311   Worklist.push_back(&MI);
312 }
313 
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
///
/// Walks the live range backwards through PHI values and partial
/// (subregister) definitions until all lanes read by the use are accounted
/// for, marking every contributing defining instruction with \p Flag.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  // Nothing to do if the register is not live into the use.
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  if (!UseLRQ.valueIn())
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  LaneBitmask DefinedLanes;
  LaneBitmask UseLanes;
  if (SubReg) {
    UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
  } else if (Reg.isVirtual()) {
    UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
  }
  // For physical registers without a subreg, UseLanes stays empty; the
  // physical-register path below marks defs without lane tracking.

  // Depth-first walk over the reaching values of the live range.
  SmallPtrSet<const VNInfo *, 4> Visited;
  SmallVector<const VNInfo *, 4> ToProcess;
  ToProcess.push_back(UseLRQ.valueIn());
  do {
    const VNInfo *Value = ToProcess.pop_back_val();
    Visited.insert(Value);

    if (Value->isPHIDef()) {
      // Need to mark all defs used in the PHI node
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
                                                  PE = MBB->pred_end();
           PI != PE; ++PI) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VN))
            ToProcess.push_back(VN);
        }
      }
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->operands()) {
          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any of use
          HasDef |= Overlap.any();

          // Check if all lanes of use have been defined
          DefinedLanes |= OpLanes;
          if ((DefinedLanes & UseLanes) != UseLanes) {
            // Definition not complete; need to process input value
            LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
            if (const VNInfo *VN = LRQ.valueIn()) {
              if (!Visited.count(VN))
                ToProcess.push_back(VN);
            }
          }
        }
        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }
  } while (!ToProcess.empty());

  // On exit, every lane of a virtual-register use must have been defined.
  assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));
}
396 
/// Mark the instructions defining register operand \p Op of \p MI with
/// \p Flag, dispatching to markDefs over the relevant live range(s).
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
                                  const MachineOperand &Op, char Flag,
                                  std::vector<WorkItem> &Worklist) {
  assert(Op.isReg());
  Register Reg = Op.getReg();

  // Ignore some hardware registers
  switch (Reg) {
  case AMDGPU::EXEC:
  case AMDGPU::EXEC_LO:
    return;
  default:
    break;
  }

  LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
                    << " for " << MI);
  if (Reg.isVirtual()) {
    LiveRange &LR = LIS->getInterval(Reg);
    markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
  } else {
    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
         ++RegUnit) {
      LiveRange &LR = LIS->getRegUnit(*RegUnit);
      const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
      // Skip register units that are not live into MI.
      if (!Value)
        continue;

      markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
    }
  }
}
432 
433 /// Mark all instructions defining the uses in \p MI with \p Flag.
434 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
435                                           std::vector<WorkItem> &Worklist) {
436   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
437                     << MI);
438 
439   for (const MachineOperand &Use : MI.uses()) {
440     if (!Use.isReg() || !Use.isUse())
441       continue;
442     markOperand(MI, Use, Flag, Worklist);
443   }
444 }
445 
446 // Scan instructions to determine which ones require an Exact execmask and
447 // which ones seed WQM requirements.
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
//
// Returns the union of all states seeded anywhere in the function, and
// populates \p Worklist plus the deferred-lowering lists
// (LiveMaskQueries/LowerToMovInstrs/LowerToCopyInstrs/KillInstrs).
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        if (!ST->hasExtendedImageInsts())
          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        // SOFT_WQM only needs WQM if WQM is used anywhere in the function;
        // that decision is deferred until after the scan (see below).
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WQM) {
        // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
        // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
        // quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        // set.inactive must not itself run in a strict mode, since it writes
        // the inactive lanes.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            // The value written to inactive lanes must be computed in WWM.
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
576 
/// Propagate the state requirements of \p MI backwards to the previous
/// instruction, up to the containing block, and into its own inputs.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the map entry and the local copy so the code below sees
    // the new requirement.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    // Strict states apply only to this instruction, so they are masked out
    // of the backwards-propagated requirement.
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
625 
/// Propagate \p MBB's in/out requirements to its last instruction and to its
/// predecessor and successor blocks.
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem>& Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}
661 
662 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
663   std::vector<WorkItem> Worklist;
664   char GlobalFlags = scanInstructions(MF, Worklist);
665 
666   while (!Worklist.empty()) {
667     WorkItem WI = Worklist.back();
668     Worklist.pop_back();
669 
670     if (WI.MI)
671       propagateInstruction(*WI.MI, Worklist);
672     else
673       propagateBlock(*WI.MBB, Worklist);
674   }
675 
676   return GlobalFlags;
677 }
678 
/// Preserve SCC across an insertion point: copy SCC into a fresh SGPR before
/// \p Before and copy it back again, returning an iterator at the restoring
/// copy so new code can be inserted between save and restore.
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  // Register the new instructions with LiveIntervals and compute the live
  // range of the temporary.
  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}
697 
/// Split \p BB after \p TermMI, turning \p TermMI into a terminator pseudo
/// when it matches a known pattern, and keep LiveIntervals and the
/// (post)dominator trees updated. Returns the new tail block (or \p BB
/// itself if no split happened).
MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
                                               MachineInstr *TermMI) {
  LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
                    << *TermMI << "\n");

  MachineBasicBlock *SplitBB =
      BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);

  // Convert last instruction in block to a terminator.
  // Note: this only covers the expected patterns
  unsigned NewOpcode = 0;
  switch (TermMI->getOpcode()) {
  case AMDGPU::S_AND_B32:
    NewOpcode = AMDGPU::S_AND_B32_term;
    break;
  case AMDGPU::S_AND_B64:
    NewOpcode = AMDGPU::S_AND_B64_term;
    break;
  case AMDGPU::S_MOV_B32:
    NewOpcode = AMDGPU::S_MOV_B32_term;
    break;
  case AMDGPU::S_MOV_B64:
    NewOpcode = AMDGPU::S_MOV_B64_term;
    break;
  default:
    break;
  }
  if (NewOpcode)
    TermMI->setDesc(TII->get(NewOpcode));

  if (SplitBB != BB) {
    // Update dominator trees
    using DomTreeT = DomTreeBase<MachineBasicBlock>;
    SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
    for (MachineBasicBlock *Succ : SplitBB->successors()) {
      DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
      DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
    }
    DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
    if (MDT)
      MDT->getBase().applyUpdates(DTUpdates);
    if (PDT)
      PDT->getBase().applyUpdates(DTUpdates);

    // Link blocks
    // An explicit branch is needed since BB now ends with a terminator that
    // does not transfer control by itself.
    MachineInstr *MI =
        BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(SplitBB);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  return SplitBB;
}
751 
/// Lower SI_KILL_F32_COND_IMM_TERMINATOR: emit an inverted V_CMP to compute
/// the killed lanes into VCC, clear them from the live mask and EXEC, insert
/// an early-terminate check, and replace \p MI with a branch. Returns the new
/// terminator.
MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
                                            MachineInstr &MI) {
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Opcode = 0;

  assert(MI.getOperand(0).isReg());

  // Comparison is for live lanes; however here we compute the inverse
  // (killed lanes).  This is because VCMP will always generate 0 bits
  // for inactive lanes so a mask of live lanes would not be correct
  // inside control flow.
  // Invert the comparison by swapping the operands and adjusting
  // the comparison codes.

  switch (MI.getOperand(2).getImm()) {
  case ISD::SETUEQ:
    Opcode = AMDGPU::V_CMP_LG_F32_e64;
    break;
  case ISD::SETUGT:
    Opcode = AMDGPU::V_CMP_GE_F32_e64;
    break;
  case ISD::SETUGE:
    Opcode = AMDGPU::V_CMP_GT_F32_e64;
    break;
  case ISD::SETULT:
    Opcode = AMDGPU::V_CMP_LE_F32_e64;
    break;
  case ISD::SETULE:
    Opcode = AMDGPU::V_CMP_LT_F32_e64;
    break;
  case ISD::SETUNE:
    Opcode = AMDGPU::V_CMP_EQ_F32_e64;
    break;
  case ISD::SETO:
    Opcode = AMDGPU::V_CMP_O_F32_e64;
    break;
  case ISD::SETUO:
    Opcode = AMDGPU::V_CMP_U_F32_e64;
    break;
  case ISD::SETOEQ:
  case ISD::SETEQ:
    Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
    break;
  case ISD::SETOGT:
  case ISD::SETGT:
    Opcode = AMDGPU::V_CMP_NLT_F32_e64;
    break;
  case ISD::SETOGE:
  case ISD::SETGE:
    Opcode = AMDGPU::V_CMP_NLE_F32_e64;
    break;
  case ISD::SETOLT:
  case ISD::SETLT:
    Opcode = AMDGPU::V_CMP_NGT_F32_e64;
    break;
  case ISD::SETOLE:
  case ISD::SETLE:
    Opcode = AMDGPU::V_CMP_NGE_F32_e64;
    break;
  case ISD::SETONE:
  case ISD::SETNE:
    Opcode = AMDGPU::V_CMP_NLG_F32_e64;
    break;
  default:
    llvm_unreachable("invalid ISD:SET cond code");
  }

  // Pick opcode based on comparison type.
  MachineInstr *VcmpMI;
  const MachineOperand &Op0 = MI.getOperand(0);
  const MachineOperand &Op1 = MI.getOperand(1);
  if (TRI->isVGPR(*MRI, Op0.getReg())) {
    // VOP32 form: implicit VCC def, no source modifiers. Note the swapped
    // operand order (Op1, Op0) to complete the comparison inversion.
    Opcode = AMDGPU::getVOPe32(Opcode);
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
  } else {
    VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
                 .addReg(AMDGPU::VCC, RegState::Define)
                 .addImm(0) // src0 modifiers
                 .add(Op1)
                 .addImm(0) // src1 modifiers
                 .add(Op0)
                 .addImm(0); // omod
  }

  // VCC represents lanes killed.
  Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  // Clear killed lanes from the live mask.
  MachineInstr *MaskUpdateMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
          .addReg(LiveMaskReg)
          .addReg(VCC);

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // Clear killed lanes from EXEC as well.
  MachineInstr *ExecMaskMI =
      BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);

  assert(MBB.succ_size() == 1);
  MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                              .addMBB(*MBB.succ_begin());

  // Update live intervals
  LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
  MBB.remove(&MI);

  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*ExecMaskMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  return NewTerm;
}
867 
// Lower SI_KILL_I1_TERMINATOR or SI_DEMOTE_I1 into live-mask updates and EXEC
// manipulation. Operand 0 is the kill condition (immediate or register),
// operand 1 is KillVal, the value of the condition for which lanes are killed.
// Returns a new terminator to be used as a block split point, or nullptr if
// the kill folded away (static no-op demote).
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  // Demote semantics only differ from kill when running in WQM.
  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (IsDemote) {
        // No-op demote: just delete the pseudo; no terminator is needed.
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // A kill is a terminator, so replace it with an explicit branch to
        // the (single) successor to keep the block well-formed.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demotes deactive quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kills deactivate lanes
    if (Op.isImm()) {
      // Static kill of all lanes: clear EXEC entirely.
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      // In Exact mode the updated live mask is exactly the surviving lanes.
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // The condition register's live range may have changed, so recompute it.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
978 
979 // Replace (or supplement) instructions accessing live mask.
980 // This can only happen once all the live mask registers have been created
981 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
982 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
983   auto BII = Blocks.find(&MBB);
984   if (BII == Blocks.end())
985     return;
986 
987   const BlockInfo &BI = BII->second;
988   if (!BI.NeedsLowering)
989     return;
990 
991   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
992 
993   SmallVector<MachineInstr *, 4> SplitPoints;
994   char State = BI.InitialState;
995 
996   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
997   while (II != IE) {
998     auto Next = std::next(II);
999     MachineInstr &MI = *II;
1000 
1001     if (StateTransition.count(&MI))
1002       State = StateTransition[&MI];
1003 
1004     MachineInstr *SplitPoint = nullptr;
1005     switch (MI.getOpcode()) {
1006     case AMDGPU::SI_DEMOTE_I1:
1007     case AMDGPU::SI_KILL_I1_TERMINATOR:
1008       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1009       break;
1010     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1011       SplitPoint = lowerKillF32(MBB, MI);
1012       break;
1013     default:
1014       break;
1015     }
1016     if (SplitPoint)
1017       SplitPoints.push_back(SplitPoint);
1018 
1019     II = Next;
1020   }
1021 
1022   // Perform splitting after instruction scan to simplify iteration.
1023   if (!SplitPoints.empty()) {
1024     MachineBasicBlock *BB = &MBB;
1025     for (MachineInstr *MI : SplitPoints) {
1026       BB = splitBlock(BB, MI);
1027     }
1028   }
1029 }
1030 
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  // Walk SCC's register-unit live range to find a point in [First, Last]
  // where SCC is dead, so we can clobber it without saving.
  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // Step over live segments of SCC towards the preferred end of the range,
  // stopping at the first index where SCC is not live (S == nullptr) or when
  // stepping further would leave [FirstIdx, LastIdx].
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      // Move backwards to just before this live segment starts.
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      // Move forwards to the instruction after this live segment ends.
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    // Idx must be the block-end index; insert at the end of the block.
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    // We skipped past an EXEC def, so the SCC segment found above no longer
    // applies at the new insertion point; drop it to avoid a stale save.
    S = nullptr;
  }

  // SCC is still live at the chosen point: insert an explicit save/restore.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
1104 
1105 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1106                               MachineBasicBlock::iterator Before,
1107                               Register SaveWQM) {
1108   MachineInstr *MI;
1109 
1110   if (SaveWQM) {
1111     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1112              .addReg(LiveMaskReg);
1113   } else {
1114     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1115              .addReg(Exec)
1116              .addReg(LiveMaskReg);
1117   }
1118 
1119   LIS->InsertMachineInstrInMaps(*MI);
1120   StateTransition[MI] = StateExact;
1121 }
1122 
1123 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1124                             MachineBasicBlock::iterator Before,
1125                             Register SavedWQM) {
1126   MachineInstr *MI;
1127 
1128   if (SavedWQM) {
1129     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1130              .addReg(SavedWQM);
1131   } else {
1132     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1133   }
1134 
1135   LIS->InsertMachineInstrInMaps(*MI);
1136   StateTransition[MI] = StateWQM;
1137 }
1138 
1139 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1140                                    MachineBasicBlock::iterator Before,
1141                                    Register SaveOrig, char StrictStateNeeded) {
1142   MachineInstr *MI;
1143   assert(SaveOrig);
1144   assert(StrictStateNeeded == StateStrictWWM ||
1145          StrictStateNeeded == StateStrictWQM);
1146 
1147   if (StrictStateNeeded == StateStrictWWM) {
1148     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1149                  SaveOrig)
1150              .addImm(-1);
1151   } else {
1152     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1153                  SaveOrig)
1154              .addImm(-1);
1155   }
1156   LIS->InsertMachineInstrInMaps(*MI);
1157   StateTransition[MI] = StateStrictWWM;
1158 }
1159 
1160 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1161                                      MachineBasicBlock::iterator Before,
1162                                      Register SavedOrig, char NonStrictState,
1163                                      char CurrentStrictState) {
1164   MachineInstr *MI;
1165 
1166   assert(SavedOrig);
1167   assert(CurrentStrictState == StateStrictWWM ||
1168          CurrentStrictState == StateStrictWQM);
1169 
1170   if (CurrentStrictState == StateStrictWWM) {
1171     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1172                  Exec)
1173              .addReg(SavedOrig);
1174   } else {
1175     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1176                  Exec)
1177              .addReg(SavedOrig);
1178   }
1179   LIS->InsertMachineInstrInMaps(*MI);
1180   StateTransition[MI] = NonStrictState;
1181 }
1182 
// Walk one basic block and insert the EXEC-mode transitions (Exact, WQM,
// StrictWWM, StrictWQM) needed by its instructions, based on the per-
// instruction flags computed during analysis. Also records the block's
// initial state for later lowering.
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
    BI.InitialState = StateWQM;
    return;
  }

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  Register SavedWQMReg;
  Register SavedNonStrictReg;
  // In the entry block, WQM can be (re)computed directly from EXEC, so no
  // save register is needed there.
  bool WQMFromExec = IsEntry;
  char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonStrictState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (IsEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from Strict
  // mode to Exact/WQM or to switch to Strict mode. It must always be the same
  // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
  // be safe to switch to/from WQM as well.
  MachineBasicBlock::iterator FirstStrict = IE;

  // Record initial state in block information.
  BI.InitialState = State;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstStrict == IE)
      FirstStrict = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateStrictWWM)
            Needs = StateStrictWWM;
          else if (III->second.Needs & StateStrictWQM)
            Needs = StateStrictWQM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave Strict mode enabled.
        Needs = StateExact | StateWQM | StateStrict;
      }

      // Terminators feeding an Exact-only successor must run Exact so the
      // correct lanes take the branch.
      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateStrictWWM || Needs == StateStrictWWM ||
          State == StateStrictWQM || Needs == StateStrictWQM) {
        // We must switch to or from Strict mode.
        First = FirstStrict;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM.
        First = FirstWQM;
      }

      // Whether we need to save SCC depends on start and end states.
      bool SaveSCC = false;
      switch (State) {
      case StateExact:
      case StateStrictWWM:
      case StateStrictWQM:
        // Exact/Strict -> Strict: save SCC
        // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
        // Exact/Strict -> Exact: no save
        SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
        break;
      case StateWQM:
        // WQM -> Exact/Strict: save SCC
        SaveSCC = !(Needs & StateWQM);
        break;
      default:
        llvm_unreachable("Unknown state");
        break;
      }
      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);

      // Leave Strict mode first (if active) before any other transition.
      if (State & StateStrict) {
        assert(State == StateStrictWWM || State == StateStrictWQM);
        assert(SavedNonStrictReg);
        fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);

        LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
        SavedNonStrictReg = 0;
        State = NonStrictState;
      }

      if (Needs & StateStrict) {
        // Entering Strict mode: remember the state to return to afterwards.
        NonStrictState = State;
        assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
        assert(!SavedNonStrictReg);
        SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);

        toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
        State = Needs;

      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          // Only save WQM if it must be restored later in this block.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs, but we
          // shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    // Reset the "safe switch point" markers once we pass an instruction that
    // constrains the state.
    if (Needs != (StateExact | StateWQM | StateStrict)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstStrict = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonStrictReg);
}
1372 
1373 void SIWholeQuadMode::lowerLiveMaskQueries() {
1374   for (MachineInstr *MI : LiveMaskQueries) {
1375     const DebugLoc &DL = MI->getDebugLoc();
1376     Register Dest = MI->getOperand(0).getReg();
1377 
1378     MachineInstr *Copy =
1379         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1380             .addReg(LiveMaskReg);
1381 
1382     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1383     MI->eraseFromParent();
1384   }
1385 }
1386 
// Rewrite the instructions collected during analysis into plain moves/copies:
// LowerToMovInstrs become V_MOV (VGPR dest, gaining an implicit EXEC use) or
// simplified SGPR COPYs; LowerToCopyInstrs become COPYs, stripping the extra
// V_SET_INACTIVE operand where present.
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        // Recompute the interval after clearing the flag so LIS stays valid.
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      // Strip every implicit EXEC use; re-query the index after each removal
      // since operand indices shift.
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->RemoveOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // the only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}
1440 
1441 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1442   for (MachineInstr *MI : KillInstrs) {
1443     MachineBasicBlock *MBB = MI->getParent();
1444     MachineInstr *SplitPoint = nullptr;
1445     switch (MI->getOpcode()) {
1446     case AMDGPU::SI_DEMOTE_I1:
1447     case AMDGPU::SI_KILL_I1_TERMINATOR:
1448       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1449       break;
1450     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1451       SplitPoint = lowerKillF32(*MBB, *MI);
1452       break;
1453     default:
1454       continue;
1455     }
1456     if (SplitPoint)
1457       splitBlock(MBB, SplitPoint);
1458   }
1459 }
1460 
// Pass entry point: analyze the function, then insert the live-mask prolog
// and WQM/Strict/Exact transitions as required. Returns true if the function
// was modified.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
                    << " ------------- \n");
  LLVM_DEBUG(MF.dump(););

  // Reset per-function state (the pass object is reused across functions).
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  KillInstrs.clear();
  StateTransition.clear();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();
  MDT = &getAnalysis<MachineDominatorTree>();
  PDT = &getAnalysis<MachinePostDominatorTree>();

  // Select wave-size-dependent opcodes and the EXEC register once up front.
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    AndN2Opc = AMDGPU::S_ANDN2_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    WQMOpc = AMDGPU::S_WQM_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    AndN2Opc = AMDGPU::S_ANDN2_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    WQMOpc = AMDGPU::S_WQM_B64;
    Exec = AMDGPU::EXEC;
  }

  const char GlobalFlags = analyzeFunction(MF);
  const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());

  // Until proven otherwise, the live mask is simply EXEC.
  LiveMaskReg = Exec;

  // Shader is simple and does not need any state changes or complex lowering.
  if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
      LowerToMovInstrs.empty() && KillInstrs.empty()) {
    lowerLiveMaskQueries();
    return !LiveMaskQueries.empty();
  }

  MachineBasicBlock &Entry = MF.front();
  MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

  // Store a copy of the original live mask when required
  if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
    LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
    MachineInstr *MI =
        BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
            .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
  }

  LLVM_DEBUG(printInfo());

  lowerLiveMaskQueries();
  lowerCopyInstrs();

  // Shader only needs WQM
  if (GlobalFlags == StateWQM) {
    // Enter WQM once at the entry and lower kills directly; no per-block
    // state tracking is needed.
    auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
                  .addReg(Exec);
    LIS->InsertMachineInstrInMaps(*MI);
    lowerKillInstrs(true);
  } else {
    for (auto BII : Blocks)
      processBlock(*BII.first, BII.first == &Entry);
    // Lowering blocks causes block splitting so perform as a second pass.
    for (auto BII : Blocks)
      lowerBlock(*BII.first);
  }

  // Compute live range for live mask
  if (LiveMaskReg != Exec)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));

  // If we performed any kills then recompute EXEC
  if (!KillInstrs.empty())
    LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));

  return true;
}
1559