1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode for pixel
11 /// shaders, and whole wavefront mode for all programs.
12 ///
13 /// Whole quad mode is required for derivative computations, but it interferes
14 /// with shader side effects (stores and atomics). It ensures that WQM is
15 /// enabled when necessary, but disabled around stores and atomics.
16 ///
17 /// When necessary, this pass creates a function prolog
18 ///
19 ///   S_MOV_B64 LiveMask, EXEC
20 ///   S_WQM_B64 EXEC, EXEC
21 ///
22 /// to enter WQM at the top of the function and surrounds blocks of Exact
23 /// instructions by
24 ///
25 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
26 ///   ...
27 ///   S_MOV_B64 EXEC, Tmp
28 ///
29 /// We also compute when a sequence of instructions requires Whole Wavefront
30 /// Mode (WWM) and insert instructions to save and restore it:
31 ///
32 /// S_OR_SAVEEXEC_B64 Tmp, -1
33 /// ...
34 /// S_MOV_B64 EXEC, Tmp
35 ///
36 /// In order to avoid excessive switching during sequences of Exact
37 /// instructions, the pass first analyzes which instructions must be run in WQM
38 /// (aka which instructions produce values that lead to derivative
39 /// computations).
40 ///
41 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
42 ///
43 /// There is room for improvement given better control flow analysis:
44 ///
45 ///  (1) at the top level (outside of control flow statements, and as long as
46 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
47 ///      the LiveMask (this is implemented for the entry block).
48 ///
49 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
50 ///      consist of exact and don't-care instructions, the switch only has to
51 ///      be done at the entry and exit points rather than potentially in each
52 ///      block of the region.
53 ///
54 //===----------------------------------------------------------------------===//
55 
56 #include "AMDGPU.h"
57 #include "GCNSubtarget.h"
58 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
59 #include "llvm/ADT/MapVector.h"
60 #include "llvm/ADT/PostOrderIterator.h"
61 #include "llvm/CodeGen/LiveIntervals.h"
62 #include "llvm/CodeGen/MachineBasicBlock.h"
63 #include "llvm/CodeGen/MachineDominators.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/CodeGen/MachineInstr.h"
66 #include "llvm/CodeGen/MachinePostDominators.h"
67 #include "llvm/IR/CallingConv.h"
68 #include "llvm/InitializePasses.h"
69 #include "llvm/Support/raw_ostream.h"
70 
71 using namespace llvm;
72 
73 #define DEBUG_TYPE "si-wqm"
74 
75 namespace {
76 
/// Execution states tracked by this pass. Every state occupies its own bit so
/// that sets of states can be combined and tested with bitwise operators.
enum {
  StateWQM = 1 << 0,   // whole quad mode
  StateWWM = 1 << 1,   // whole wavefront mode
  StateExact = 1 << 2, // exact mode
};
82 
/// Thin wrapper around a state bitmask. Wrapping the mask in its own type
/// lets debug output pick a dedicated stream insertion operator for it.
struct PrintState {
  /// The wrapped bitmask of State* values.
  int State;

  explicit PrintState(int Mask) : State(Mask) {}
};
89 
90 #ifndef NDEBUG
91 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
92   if (PS.State & StateWQM)
93     OS << "WQM";
94   if (PS.State & StateWWM) {
95     if (PS.State & StateWQM)
96       OS << '|';
97     OS << "WWM";
98   }
99   if (PS.State & StateExact) {
100     if (PS.State & (StateWQM | StateWWM))
101       OS << '|';
102     OS << "Exact";
103   }
104 
105   return OS;
106 }
107 #endif
108 
/// Per-instruction analysis state; every field is a bitmask of State* values.
struct InstrInfo {
  /// States requested for this instruction (accumulated by markInstruction).
  char Needs{0};
  /// States that markInstruction strips from any incoming request.
  char Disabled{0};
  /// States required after this instruction, propagated backwards from the
  /// following instruction or from the end of the block.
  char OutNeeds{0};
};
114 
/// Per-basic-block analysis state; the char fields are bitmasks of State*
/// values.
struct BlockInfo {
  /// States required by instructions inside the block.
  char Needs{0};
  /// States required on entry to the block.
  char InNeeds{0};
  /// States required on exit from the block.
  char OutNeeds{0};
  /// NOTE(review): not written in the part of the file shown here; presumably
  /// the state the block is entered in -- confirm against processBlock.
  char InitialState{0};
  /// Set when the block contains a kill/demote pseudo that must be lowered.
  bool NeedsLowering{false};
};
122 
123 struct WorkItem {
124   MachineBasicBlock *MBB = nullptr;
125   MachineInstr *MI = nullptr;
126 
127   WorkItem() = default;
128   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
129   WorkItem(MachineInstr *MI) : MI(MI) {}
130 };
131 
132 class SIWholeQuadMode : public MachineFunctionPass {
133 private:
134   const SIInstrInfo *TII;
135   const SIRegisterInfo *TRI;
136   const GCNSubtarget *ST;
137   MachineRegisterInfo *MRI;
138   LiveIntervals *LIS;
139   MachineDominatorTree *MDT;
140   MachinePostDominatorTree *PDT;
141 
142   unsigned AndOpc;
143   unsigned AndN2Opc;
144   unsigned XorOpc;
145   unsigned AndSaveExecOpc;
146   unsigned OrSaveExecOpc;
147   unsigned WQMOpc;
148   Register Exec;
149   Register LiveMaskReg;
150 
151   DenseMap<const MachineInstr *, InstrInfo> Instructions;
152   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
153 
154   // Tracks state (WQM/WWM/Exact) after a given instruction
155   DenseMap<const MachineInstr *, char> StateTransition;
156 
157   SmallVector<MachineInstr *, 2> LiveMaskQueries;
158   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
159   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
160   SmallVector<MachineInstr *, 4> KillInstrs;
161 
162   void printInfo();
163 
164   void markInstruction(MachineInstr &MI, char Flag,
165                        std::vector<WorkItem> &Worklist);
166   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
167                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
168   void markInstructionUses(const MachineInstr &MI, char Flag,
169                            std::vector<WorkItem> &Worklist);
170   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
171   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
172   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
173   char analyzeFunction(MachineFunction &MF);
174 
175   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
176                                       MachineBasicBlock::iterator Before);
177   MachineBasicBlock::iterator
178   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
179                    MachineBasicBlock::iterator Last, bool PreferLast,
180                    bool SaveSCC);
181   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
182                Register SaveWQM);
183   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
184              Register SavedWQM);
185   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
186              Register SaveOrig);
187   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
188                Register SavedOrig, char NonWWMState);
189 
190   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
191 
192   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
193                             bool IsWQM);
194   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
195 
196   void lowerBlock(MachineBasicBlock &MBB);
197   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
198 
199   void lowerLiveMaskQueries();
200   void lowerCopyInstrs();
201   void lowerKillInstrs(bool IsWQM);
202 
203 public:
204   static char ID;
205 
206   SIWholeQuadMode() :
207     MachineFunctionPass(ID) { }
208 
209   bool runOnMachineFunction(MachineFunction &MF) override;
210 
211   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
212 
213   void getAnalysisUsage(AnalysisUsage &AU) const override {
214     AU.addRequired<LiveIntervals>();
215     AU.addPreserved<SlotIndexes>();
216     AU.addPreserved<LiveIntervals>();
217     AU.addRequired<MachineDominatorTree>();
218     AU.addPreserved<MachineDominatorTree>();
219     AU.addRequired<MachinePostDominatorTree>();
220     AU.addPreserved<MachinePostDominatorTree>();
221     MachineFunctionPass::getAnalysisUsage(AU);
222   }
223 
224   MachineFunctionProperties getClearedProperties() const override {
225     return MachineFunctionProperties().set(
226         MachineFunctionProperties::Property::IsSSA);
227   }
228 };
229 
230 } // end anonymous namespace
231 
// Pass identifier; handed to the MachineFunctionPass constructor and exported
// below as llvm::SIWholeQuadModeID.
char SIWholeQuadMode::ID = 0;

// Register the pass under DEBUG_TYPE together with the analyses it depends on
// (the same ones requested in getAnalysisUsage).
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

// Public handle that lets code outside this file refer to the pass by its ID.
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
243 
244 FunctionPass *llvm::createSIWholeQuadModePass() {
245   return new SIWholeQuadMode;
246 }
247 
248 #ifndef NDEBUG
249 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
250   for (const auto &BII : Blocks) {
251     dbgs() << "\n"
252            << printMBBReference(*BII.first) << ":\n"
253            << "  InNeeds = " << PrintState(BII.second.InNeeds)
254            << ", Needs = " << PrintState(BII.second.Needs)
255            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
256 
257     for (const MachineInstr &MI : *BII.first) {
258       auto III = Instructions.find(&MI);
259       if (III == Instructions.end())
260         continue;
261 
262       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
263              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
264     }
265   }
266 }
267 #endif
268 
269 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
270                                       std::vector<WorkItem> &Worklist) {
271   InstrInfo &II = Instructions[&MI];
272 
273   assert(!(Flag & StateExact) && Flag != 0);
274 
275   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
276 
277   // Remove any disabled states from the flag. The user that required it gets
278   // an undefined value in the helper lanes. For example, this can happen if
279   // the result of an atomic is used by instruction that requires WQM, where
280   // ignoring the request for WQM is correct as per the relevant specs.
281   Flag &= ~II.Disabled;
282 
283   // Ignore if the flag is already encompassed by the existing needs, or we
284   // just disabled everything.
285   if ((II.Needs & Flag) == Flag)
286     return;
287 
288   II.Needs |= Flag;
289   Worklist.push_back(&MI);
290 }
291 
292 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
293 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
294                                Register Reg, unsigned SubReg, char Flag,
295                                std::vector<WorkItem> &Worklist) {
296   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
297 
298   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
299   if (!UseLRQ.valueIn())
300     return;
301 
302   SmallPtrSet<const VNInfo *, 4> Visited;
303   SmallVector<const VNInfo *, 4> ToProcess;
304   ToProcess.push_back(UseLRQ.valueIn());
305   do {
306     const VNInfo *Value = ToProcess.pop_back_val();
307     Visited.insert(Value);
308 
309     if (Value->isPHIDef()) {
310       // Need to mark all defs used in the PHI node
311       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
312       assert(MBB && "Phi-def has no defining MBB");
313       for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
314                                                   PE = MBB->pred_end();
315            PI != PE; ++PI) {
316         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
317           if (!Visited.count(VN))
318             ToProcess.push_back(VN);
319         }
320       }
321     } else {
322       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
323       assert(MI && "Def has no defining instruction");
324       markInstruction(*MI, Flag, Worklist);
325 
326       // Iterate over all operands to find relevant definitions
327       for (const MachineOperand &Op : MI->operands()) {
328         if (!(Op.isReg() && Op.getReg() == Reg))
329           continue;
330 
331         // Does this def cover whole register?
332         bool DefinesFullReg =
333             Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
334         if (!DefinesFullReg) {
335           // Partial definition; need to follow and mark input value
336           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
337           if (const VNInfo *VN = LRQ.valueIn()) {
338             if (!Visited.count(VN))
339               ToProcess.push_back(VN);
340           }
341         }
342       }
343     }
344   } while (!ToProcess.empty());
345 }
346 
347 /// Mark all instructions defining the uses in \p MI with \p Flag.
348 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
349                                           std::vector<WorkItem> &Worklist) {
350 
351   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
352                     << MI);
353 
354   for (const MachineOperand &Use : MI.uses()) {
355     if (!Use.isReg() || !Use.isUse())
356       continue;
357 
358     Register Reg = Use.getReg();
359 
360     // Handle physical registers that we need to track; this is mostly relevant
361     // for VCC, which can appear as the (implicit) input of a uniform branch,
362     // e.g. when a loop counter is stored in a VGPR.
363     if (!Reg.isVirtual()) {
364       if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
365         continue;
366 
367       for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
368            ++RegUnit) {
369         LiveRange &LR = LIS->getRegUnit(*RegUnit);
370         const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
371         if (!Value)
372           continue;
373 
374         markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
375       }
376 
377       continue;
378     }
379 
380     LiveRange &LR = LIS->getInterval(Reg);
381     markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
382   }
383 }
384 
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
//
// Along the way the pseudo instructions that need lowering later are collected
// in LiveMaskQueries, LowerToMovInstrs, LowerToCopyInstrs and KillInstrs, and
// initial work items are pushed onto Worklist. Returns the union of all states
// (WQM/WWM/Exact) that are required somewhere in the function.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  // Instructions whose WQM marking depends on whether WQM is used anywhere;
  // they are handled after the scan, once GlobalFlags is complete.
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      // State to request for MI itself; stays 0 if MI needs no marking.
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        // Only marked as WQM after the scan, and only if WQM is used at all.
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and plus it needs
        // to be executed in WQM or Exact so that its copy doesn't clobber
        // inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        // The instruction itself must not run in WWM.
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            // The definitions of operand 2 (a virtual register) are requested
            // in WWM.
            Register Reg = Inactive.getReg();
            if (Reg.isVirtual()) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        // Requires Exact: record it on the block and make sure the block gets
        // propagated.
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      // Reached only with a non-zero Flags value (see the branches above).
      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
508 
509 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
510                                            std::vector<WorkItem>& Worklist) {
511   MachineBasicBlock *MBB = MI.getParent();
512   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
513   BlockInfo &BI = Blocks[MBB];
514 
515   // Control flow-type instructions and stores to temporary memory that are
516   // followed by WQM computations must themselves be in WQM.
517   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
518       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
519     Instructions[&MI].Needs = StateWQM;
520     II.Needs = StateWQM;
521   }
522 
523   // Propagate to block level
524   if (II.Needs & StateWQM) {
525     BI.Needs |= StateWQM;
526     if (!(BI.InNeeds & StateWQM)) {
527       BI.InNeeds |= StateWQM;
528       Worklist.push_back(MBB);
529     }
530   }
531 
532   // Propagate backwards within block
533   if (MachineInstr *PrevMI = MI.getPrevNode()) {
534     char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
535     if (!PrevMI->isPHI()) {
536       InstrInfo &PrevII = Instructions[PrevMI];
537       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
538         PrevII.OutNeeds |= InNeeds;
539         Worklist.push_back(PrevMI);
540       }
541     }
542   }
543 
544   // Propagate WQM flag to instruction inputs
545   assert(!(II.Needs & StateExact));
546 
547   if (II.Needs != 0)
548     markInstructionUses(MI, II.Needs, Worklist);
549 
550   // Ensure we process a block containing WWM, even if it does not require any
551   // WQM transitions.
552   if (II.Needs & StateWWM)
553     BI.Needs |= StateWWM;
554 }
555 
556 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
557                                      std::vector<WorkItem>& Worklist) {
558   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
559 
560   // Propagate through instructions
561   if (!MBB.empty()) {
562     MachineInstr *LastMI = &*MBB.rbegin();
563     InstrInfo &LastII = Instructions[LastMI];
564     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
565       LastII.OutNeeds |= BI.OutNeeds;
566       Worklist.push_back(LastMI);
567     }
568   }
569 
570   // Predecessor blocks must provide for our WQM/Exact needs.
571   for (MachineBasicBlock *Pred : MBB.predecessors()) {
572     BlockInfo &PredBI = Blocks[Pred];
573     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
574       continue;
575 
576     PredBI.OutNeeds |= BI.InNeeds;
577     PredBI.InNeeds |= BI.InNeeds;
578     Worklist.push_back(Pred);
579   }
580 
581   // All successors must be prepared to accept the same set of WQM/Exact data.
582   for (MachineBasicBlock *Succ : MBB.successors()) {
583     BlockInfo &SuccBI = Blocks[Succ];
584     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
585       continue;
586 
587     SuccBI.InNeeds |= BI.OutNeeds;
588     Worklist.push_back(Succ);
589   }
590 }
591 
592 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
593   std::vector<WorkItem> Worklist;
594   char GlobalFlags = scanInstructions(MF, Worklist);
595 
596   while (!Worklist.empty()) {
597     WorkItem WI = Worklist.back();
598     Worklist.pop_back();
599 
600     if (WI.MI)
601       propagateInstruction(*WI.MI, Worklist);
602     else
603       propagateBlock(*WI.MBB, Worklist);
604   }
605 
606   return GlobalFlags;
607 }
608 
609 MachineBasicBlock::iterator
610 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
611                          MachineBasicBlock::iterator Before) {
612   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
613 
614   MachineInstr *Save =
615       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
616           .addReg(AMDGPU::SCC);
617   MachineInstr *Restore =
618       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
619           .addReg(SaveReg);
620 
621   LIS->InsertMachineInstrInMaps(*Save);
622   LIS->InsertMachineInstrInMaps(*Restore);
623   LIS->createAndComputeVirtRegInterval(SaveReg);
624 
625   return Restore;
626 }
627 
628 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
629                                                MachineInstr *TermMI) {
630   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
631                     << *TermMI << "\n");
632 
633   MachineBasicBlock *SplitBB =
634       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
635 
636   // Convert last instruction in block to a terminator.
637   // Note: this only covers the expected patterns
638   unsigned NewOpcode = 0;
639   switch (TermMI->getOpcode()) {
640   case AMDGPU::S_AND_B32:
641     NewOpcode = AMDGPU::S_AND_B32_term;
642     break;
643   case AMDGPU::S_AND_B64:
644     NewOpcode = AMDGPU::S_AND_B64_term;
645     break;
646   case AMDGPU::S_MOV_B32:
647     NewOpcode = AMDGPU::S_MOV_B32_term;
648     break;
649   case AMDGPU::S_MOV_B64:
650     NewOpcode = AMDGPU::S_MOV_B64_term;
651     break;
652   default:
653     break;
654   }
655   if (NewOpcode)
656     TermMI->setDesc(TII->get(NewOpcode));
657 
658   if (SplitBB != BB) {
659     // Update dominator trees
660     using DomTreeT = DomTreeBase<MachineBasicBlock>;
661     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
662     for (MachineBasicBlock *Succ : SplitBB->successors()) {
663       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
664       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
665     }
666     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
667     if (MDT)
668       MDT->getBase().applyUpdates(DTUpdates);
669     if (PDT)
670       PDT->getBase().applyUpdates(DTUpdates);
671 
672     // Link blocks
673     MachineInstr *MI =
674         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
675             .addMBB(SplitBB);
676     LIS->InsertMachineInstrInMaps(*MI);
677   }
678 
679   return SplitBB;
680 }
681 
682 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
683                                             MachineInstr &MI) {
684   const DebugLoc &DL = MI.getDebugLoc();
685   unsigned Opcode = 0;
686 
687   assert(MI.getOperand(0).isReg());
688 
689   // Comparison is for live lanes; however here we compute the inverse
690   // (killed lanes).  This is because VCMP will always generate 0 bits
691   // for inactive lanes so a mask of live lanes would not be correct
692   // inside control flow.
693   // Invert the comparison by swapping the operands and adjusting
694   // the comparison codes.
695 
696   switch (MI.getOperand(2).getImm()) {
697   case ISD::SETUEQ:
698     Opcode = AMDGPU::V_CMP_LG_F32_e64;
699     break;
700   case ISD::SETUGT:
701     Opcode = AMDGPU::V_CMP_GE_F32_e64;
702     break;
703   case ISD::SETUGE:
704     Opcode = AMDGPU::V_CMP_GT_F32_e64;
705     break;
706   case ISD::SETULT:
707     Opcode = AMDGPU::V_CMP_LE_F32_e64;
708     break;
709   case ISD::SETULE:
710     Opcode = AMDGPU::V_CMP_LT_F32_e64;
711     break;
712   case ISD::SETUNE:
713     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
714     break;
715   case ISD::SETO:
716     Opcode = AMDGPU::V_CMP_O_F32_e64;
717     break;
718   case ISD::SETUO:
719     Opcode = AMDGPU::V_CMP_U_F32_e64;
720     break;
721   case ISD::SETOEQ:
722   case ISD::SETEQ:
723     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
724     break;
725   case ISD::SETOGT:
726   case ISD::SETGT:
727     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
728     break;
729   case ISD::SETOGE:
730   case ISD::SETGE:
731     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
732     break;
733   case ISD::SETOLT:
734   case ISD::SETLT:
735     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
736     break;
737   case ISD::SETOLE:
738   case ISD::SETLE:
739     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
740     break;
741   case ISD::SETONE:
742   case ISD::SETNE:
743     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
744     break;
745   default:
746     llvm_unreachable("invalid ISD:SET cond code");
747   }
748 
749   // Pick opcode based on comparison type.
750   MachineInstr *VcmpMI;
751   const MachineOperand &Op0 = MI.getOperand(0);
752   const MachineOperand &Op1 = MI.getOperand(1);
753   if (TRI->isVGPR(*MRI, Op0.getReg())) {
754     Opcode = AMDGPU::getVOPe32(Opcode);
755     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
756   } else {
757     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
758                  .addReg(AMDGPU::VCC, RegState::Define)
759                  .addImm(0) // src0 modifiers
760                  .add(Op1)
761                  .addImm(0) // src1 modifiers
762                  .add(Op0)
763                  .addImm(0); // omod
764   }
765 
766   // VCC represents lanes killed.
767   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
768 
769   MachineInstr *MaskUpdateMI =
770       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
771           .addReg(LiveMaskReg)
772           .addReg(VCC);
773 
774   // State of SCC represents whether any lanes are live in mask,
775   // if SCC is 0 then no lanes will be alive anymore.
776   MachineInstr *EarlyTermMI =
777       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
778 
779   MachineInstr *ExecMaskMI =
780       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
781 
782   assert(MBB.succ_size() == 1);
783   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
784                               .addMBB(*MBB.succ_begin());
785 
786   // Update live intervals
787   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
788   MBB.remove(&MI);
789 
790   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
791   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
792   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
793   LIS->InsertMachineInstrInMaps(*NewTerm);
794 
795   return NewTerm;
796 }
797 
/// Lower the boolean kill/demote pseudo \p MI in \p MBB.
///
/// Operand 0 is the condition (immediate or register) and operand 1 is the
/// value of the condition for which lanes are killed. The lowering updates
/// the live mask, inserts an early-terminate check and then updates EXEC.
/// \p MI is treated as a demote only if \p IsWQM is set and its opcode is
/// SI_DEMOTE_I1; otherwise it is lowered as a kill.
///
/// \returns the instruction that takes the place of the terminator, or
/// nullptr when a demote with a statically false condition is simply removed.
MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
                                           MachineInstr &MI, bool IsWQM) {
  const DebugLoc &DL = MI.getDebugLoc();
  MachineInstr *MaskUpdateMI = nullptr;

  const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
  const MachineOperand &Op = MI.getOperand(0);
  int64_t KillVal = MI.getOperand(1).getImm();
  MachineInstr *ComputeKilledMaskMI = nullptr;
  // Condition register, if any; its live interval is recomputed at the end.
  Register CndReg = !Op.isImm() ? Op.getReg() : Register();
  Register TmpReg;

  // Is this a static or dynamic kill?
  if (Op.isImm()) {
    if (Op.getImm() == KillVal) {
      // Static: all active lanes are killed
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(Exec);
    } else {
      // Static: kill does nothing
      MachineInstr *NewTerm = nullptr;
      if (IsDemote) {
        // A demote is dropped without a replacement instruction.
        LIS->RemoveMachineInstrFromMaps(MI);
      } else {
        // A kill is replaced by a plain branch to the single successor.
        assert(MBB.succ_size() == 1);
        NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
                      .addMBB(*MBB.succ_begin());
        LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
      }
      MBB.remove(&MI);
      return NewTerm;
    }
  } else {
    if (!KillVal) {
      // Op represents live lanes after kill,
      // so exec mask needs to be factored in.
      TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
      ComputeKilledMaskMI =
          BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .addReg(TmpReg);
    } else {
      // Op represents lanes to kill
      MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
                         .addReg(LiveMaskReg)
                         .add(Op);
    }
  }

  // State of SCC represents whether any lanes are live in mask,
  // if SCC is 0 then no lanes will be alive anymore.
  MachineInstr *EarlyTermMI =
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));

  // In the case we got this far some lanes are still live,
  // update EXEC to deactivate lanes as appropriate.
  MachineInstr *NewTerm;
  MachineInstr *WQMMaskMI = nullptr;
  Register LiveMaskWQM;
  if (IsDemote) {
    // Demotes deactivate quads with only helper lanes
    LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
    WQMMaskMI =
        BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
    NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
                  .addReg(Exec)
                  .addReg(LiveMaskWQM);
  } else {
    // Kills deactivate lanes
    if (Op.isImm()) {
      // Static kill of all active lanes: clear EXEC entirely.
      unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
    } else if (!IsWQM) {
      // Outside WQM, EXEC is masked with the updated live mask.
      NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
                    .addReg(Exec)
                    .addReg(LiveMaskReg);
    } else {
      // In WQM, EXEC is masked directly with the condition: ANDN2 when the
      // condition marks killed lanes, AND when it marks surviving lanes.
      unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
      NewTerm =
          BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
    }
  }

  // Update live intervals
  LIS->RemoveMachineInstrFromMaps(MI);
  MBB.remove(&MI);
  assert(EarlyTermMI);
  assert(MaskUpdateMI);
  assert(NewTerm);
  if (ComputeKilledMaskMI)
    LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
  LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
  LIS->InsertMachineInstrInMaps(*EarlyTermMI);
  if (WQMMaskMI)
    LIS->InsertMachineInstrInMaps(*WQMMaskMI);
  LIS->InsertMachineInstrInMaps(*NewTerm);

  // The condition register's uses changed, so rebuild its interval; the newly
  // created virtual registers need intervals of their own.
  if (CndReg) {
    LIS->removeInterval(CndReg);
    LIS->createAndComputeVirtRegInterval(CndReg);
  }
  if (TmpReg)
    LIS->createAndComputeVirtRegInterval(TmpReg);
  if (LiveMaskWQM)
    LIS->createAndComputeVirtRegInterval(LiveMaskWQM);

  return NewTerm;
}
908 
909 // Replace (or supplement) instructions accessing live mask.
910 // This can only happen once all the live mask registers have been created
911 // and the execute state (WQM/WWM/Exact) of instructions is known.
912 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
913   auto BII = Blocks.find(&MBB);
914   if (BII == Blocks.end())
915     return;
916 
917   const BlockInfo &BI = BII->second;
918   if (!BI.NeedsLowering)
919     return;
920 
921   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
922 
923   SmallVector<MachineInstr *, 4> SplitPoints;
924   char State = BI.InitialState;
925 
926   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
927   while (II != IE) {
928     auto Next = std::next(II);
929     MachineInstr &MI = *II;
930 
931     if (StateTransition.count(&MI))
932       State = StateTransition[&MI];
933 
934     MachineInstr *SplitPoint = nullptr;
935     switch (MI.getOpcode()) {
936     case AMDGPU::SI_DEMOTE_I1:
937     case AMDGPU::SI_KILL_I1_TERMINATOR:
938       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
939       break;
940     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
941       SplitPoint = lowerKillF32(MBB, MI);
942       break;
943     default:
944       break;
945     }
946     if (SplitPoint)
947       SplitPoints.push_back(SplitPoint);
948 
949     II = Next;
950   }
951 
952   // Perform splitting after instruction scan to simplify iteration.
953   if (!SplitPoints.empty()) {
954     MachineBasicBlock *BB = &MBB;
955     for (MachineInstr *MI : SplitPoints) {
956       BB = splitBlock(BB, MI);
957     }
958   }
959 }
960 
961 // Return an iterator in the (inclusive) range [First, Last] at which
962 // instructions can be safely inserted, keeping in mind that some of the
963 // instructions we want to add necessarily clobber SCC.
964 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
965     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
966     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
967   if (!SaveSCC)
968     return PreferLast ? Last : First;
969 
970   LiveRange &LR =
971       LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
972   auto MBBE = MBB.end();
973   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
974                                      : LIS->getMBBEndIdx(&MBB);
975   SlotIndex LastIdx =
976       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
977   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
978   const LiveRange::Segment *S;
979 
980   for (;;) {
981     S = LR.getSegmentContaining(Idx);
982     if (!S)
983       break;
984 
985     if (PreferLast) {
986       SlotIndex Next = S->start.getBaseIndex();
987       if (Next < FirstIdx)
988         break;
989       Idx = Next;
990     } else {
991       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
992       assert(EndMI && "Segment does not end on valid instruction");
993       auto NextI = std::next(EndMI->getIterator());
994       if (NextI == MBB.end())
995         break;
996       SlotIndex Next = LIS->getInstructionIndex(*NextI);
997       if (Next > LastIdx)
998         break;
999       Idx = Next;
1000     }
1001   }
1002 
1003   MachineBasicBlock::iterator MBBI;
1004 
1005   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1006     MBBI = MI;
1007   else {
1008     assert(Idx == LIS->getMBBEndIdx(&MBB));
1009     MBBI = MBB.end();
1010   }
1011 
1012   // Move insertion point past any operations modifying EXEC.
1013   // This assumes that the value of SCC defined by any of these operations
1014   // does not need to be preserved.
1015   while (MBBI != Last) {
1016     bool IsExecDef = false;
1017     for (const MachineOperand &MO : MBBI->operands()) {
1018       if (MO.isReg() && MO.isDef()) {
1019         IsExecDef |=
1020             MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1021       }
1022     }
1023     if (!IsExecDef)
1024       break;
1025     MBBI++;
1026     S = nullptr;
1027   }
1028 
1029   if (S)
1030     MBBI = saveSCC(MBB, MBBI);
1031 
1032   return MBBI;
1033 }
1034 
1035 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1036                               MachineBasicBlock::iterator Before,
1037                               Register SaveWQM) {
1038   MachineInstr *MI;
1039 
1040   if (SaveWQM) {
1041     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1042              .addReg(LiveMaskReg);
1043   } else {
1044     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1045              .addReg(Exec)
1046              .addReg(LiveMaskReg);
1047   }
1048 
1049   LIS->InsertMachineInstrInMaps(*MI);
1050   StateTransition[MI] = StateExact;
1051 }
1052 
1053 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1054                             MachineBasicBlock::iterator Before,
1055                             Register SavedWQM) {
1056   MachineInstr *MI;
1057 
1058   if (SavedWQM) {
1059     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1060              .addReg(SavedWQM);
1061   } else {
1062     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1063   }
1064 
1065   LIS->InsertMachineInstrInMaps(*MI);
1066   StateTransition[MI] = StateWQM;
1067 }
1068 
1069 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
1070                             MachineBasicBlock::iterator Before,
1071                             Register SaveOrig) {
1072   MachineInstr *MI;
1073 
1074   assert(SaveOrig);
1075   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
1076            .addImm(-1);
1077   LIS->InsertMachineInstrInMaps(*MI);
1078   StateTransition[MI] = StateWWM;
1079 }
1080 
1081 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
1082                               MachineBasicBlock::iterator Before,
1083                               Register SavedOrig, char NonWWMState) {
1084   MachineInstr *MI;
1085 
1086   assert(SavedOrig);
1087   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec)
1088            .addReg(SavedOrig);
1089   LIS->InsertMachineInstrInMaps(*MI);
1090   StateTransition[MI] = NonWWMState;
1091 }
1092 
1093 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1094   auto BII = Blocks.find(&MBB);
1095   if (BII == Blocks.end())
1096     return;
1097 
1098   BlockInfo &BI = BII->second;
1099 
1100   // This is a non-entry block that is WQM throughout, so no need to do
1101   // anything.
1102   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1103     BI.InitialState = StateWQM;
1104     return;
1105   }
1106 
1107   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1108                     << ":\n");
1109 
1110   Register SavedWQMReg;
1111   Register SavedNonWWMReg;
1112   bool WQMFromExec = IsEntry;
1113   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1114   char NonWWMState = 0;
1115   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1116 
1117   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1118   if (IsEntry) {
1119     // Skip the instruction that saves LiveMask
1120     if (II != IE && II->getOpcode() == AMDGPU::COPY)
1121       ++II;
1122   }
1123 
1124   // This stores the first instruction where it's safe to switch from WQM to
1125   // Exact or vice versa.
1126   MachineBasicBlock::iterator FirstWQM = IE;
1127 
1128   // This stores the first instruction where it's safe to switch from WWM to
1129   // Exact/WQM or to switch to WWM. It must always be the same as, or after,
1130   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
1131   // switch to/from WQM as well.
1132   MachineBasicBlock::iterator FirstWWM = IE;
1133 
1134   // Record initial state is block information.
1135   BI.InitialState = State;
1136 
1137   for (;;) {
1138     MachineBasicBlock::iterator Next = II;
1139     char Needs = StateExact | StateWQM; // WWM is disabled by default
1140     char OutNeeds = 0;
1141 
1142     if (FirstWQM == IE)
1143       FirstWQM = II;
1144 
1145     if (FirstWWM == IE)
1146       FirstWWM = II;
1147 
1148     // First, figure out the allowed states (Needs) based on the propagated
1149     // flags.
1150     if (II != IE) {
1151       MachineInstr &MI = *II;
1152 
1153       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1154         auto III = Instructions.find(&MI);
1155         if (III != Instructions.end()) {
1156           if (III->second.Needs & StateWWM)
1157             Needs = StateWWM;
1158           else if (III->second.Needs & StateWQM)
1159             Needs = StateWQM;
1160           else
1161             Needs &= ~III->second.Disabled;
1162           OutNeeds = III->second.OutNeeds;
1163         }
1164       } else {
1165         // If the instruction doesn't actually need a correct EXEC, then we can
1166         // safely leave WWM enabled.
1167         Needs = StateExact | StateWQM | StateWWM;
1168       }
1169 
1170       if (MI.isTerminator() && OutNeeds == StateExact)
1171         Needs = StateExact;
1172 
1173       ++Next;
1174     } else {
1175       // End of basic block
1176       if (BI.OutNeeds & StateWQM)
1177         Needs = StateWQM;
1178       else if (BI.OutNeeds == StateExact)
1179         Needs = StateExact;
1180       else
1181         Needs = StateWQM | StateExact;
1182     }
1183 
1184     // Now, transition if necessary.
1185     if (!(Needs & State)) {
1186       MachineBasicBlock::iterator First;
1187       if (State == StateWWM || Needs == StateWWM) {
1188         // We must switch to or from WWM
1189         First = FirstWWM;
1190       } else {
1191         // We only need to switch to/from WQM, so we can use FirstWQM
1192         First = FirstWQM;
1193       }
1194 
1195       // Whether we need to save SCC depends on start and end states
1196       bool SaveSCC = false;
1197       switch (State) {
1198       case StateExact:
1199       case StateWWM:
1200         // Exact/WWM -> WWM: save SCC
1201         // Exact/WWM -> WQM: save SCC if WQM mask is generated from exec
1202         // Exact/WWM -> Exact: no save
1203         SaveSCC = (Needs & StateWWM) || ((Needs & StateWQM) && WQMFromExec);
1204         break;
1205       case StateWQM:
1206         // WQM -> Exact/WMM: save SCC
1207         SaveSCC = !(Needs & StateWQM);
1208         break;
1209       default:
1210         llvm_unreachable("Unknown state");
1211         break;
1212       }
1213       MachineBasicBlock::iterator Before =
1214           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1215 
1216       if (State == StateWWM) {
1217         assert(SavedNonWWMReg);
1218         fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState);
1219         LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
1220         SavedNonWWMReg = 0;
1221         State = NonWWMState;
1222       }
1223 
1224       if (Needs == StateWWM) {
1225         NonWWMState = State;
1226         assert(!SavedNonWWMReg);
1227         SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
1228         toWWM(MBB, Before, SavedNonWWMReg);
1229         State = StateWWM;
1230       } else {
1231         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1232           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1233             assert(!SavedWQMReg);
1234             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1235           }
1236 
1237           toExact(MBB, Before, SavedWQMReg);
1238           State = StateExact;
1239         } else if (State == StateExact && (Needs & StateWQM) &&
1240                    !(Needs & StateExact)) {
1241           assert(WQMFromExec == (SavedWQMReg == 0));
1242 
1243           toWQM(MBB, Before, SavedWQMReg);
1244 
1245           if (SavedWQMReg) {
1246             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1247             SavedWQMReg = 0;
1248           }
1249           State = StateWQM;
1250         } else {
1251           // We can get here if we transitioned from WWM to a non-WWM state that
1252           // already matches our needs, but we shouldn't need to do anything.
1253           assert(Needs & State);
1254         }
1255       }
1256     }
1257 
1258     if (Needs != (StateExact | StateWQM | StateWWM)) {
1259       if (Needs != (StateExact | StateWQM))
1260         FirstWQM = IE;
1261       FirstWWM = IE;
1262     }
1263 
1264     if (II == IE)
1265       break;
1266 
1267     II = Next;
1268   }
1269   assert(!SavedWQMReg);
1270   assert(!SavedNonWWMReg);
1271 }
1272 
1273 void SIWholeQuadMode::lowerLiveMaskQueries() {
1274   for (MachineInstr *MI : LiveMaskQueries) {
1275     const DebugLoc &DL = MI->getDebugLoc();
1276     Register Dest = MI->getOperand(0).getReg();
1277 
1278     MachineInstr *Copy =
1279         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1280             .addReg(LiveMaskReg);
1281 
1282     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1283     MI->eraseFromParent();
1284   }
1285 }
1286 
1287 void SIWholeQuadMode::lowerCopyInstrs() {
1288   for (MachineInstr *MI : LowerToMovInstrs) {
1289     assert(MI->getNumExplicitOperands() == 2);
1290 
1291     const Register Reg = MI->getOperand(0).getReg();
1292     const unsigned SubReg = MI->getOperand(0).getSubReg();
1293 
1294     if (TRI->isVGPR(*MRI, Reg)) {
1295       const TargetRegisterClass *regClass =
1296           Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
1297       if (SubReg)
1298         regClass = TRI->getSubRegClass(regClass, SubReg);
1299 
1300       const unsigned MovOp = TII->getMovOpcode(regClass);
1301       MI->setDesc(TII->get(MovOp));
1302 
1303       // And make it implicitly depend on exec (like all VALU movs should do).
1304       MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
1305     } else {
1306       // Remove early-clobber and exec dependency from simple SGPR copies.
1307       // This allows some to be eliminated during/post RA.
1308       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1309       if (MI->getOperand(0).isEarlyClobber()) {
1310         LIS->removeInterval(Reg);
1311         MI->getOperand(0).setIsEarlyClobber(false);
1312         LIS->createAndComputeVirtRegInterval(Reg);
1313       }
1314       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1315       while (Index >= 0) {
1316         MI->RemoveOperand(Index);
1317         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1318       }
1319       MI->setDesc(TII->get(AMDGPU::COPY));
1320       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1321     }
1322   }
1323   for (MachineInstr *MI : LowerToCopyInstrs) {
1324     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1325         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1326       assert(MI->getNumExplicitOperands() == 3);
1327       // the only reason we should be here is V_SET_INACTIVE has
1328       // an undef input so it is being replaced by a simple copy.
1329       // There should be a second undef source that we should remove.
1330       assert(MI->getOperand(2).isUndef());
1331       MI->RemoveOperand(2);
1332       MI->untieRegOperand(1);
1333     } else {
1334       assert(MI->getNumExplicitOperands() == 2);
1335     }
1336 
1337     MI->setDesc(TII->get(AMDGPU::COPY));
1338   }
1339 }
1340 
1341 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1342   for (MachineInstr *MI : KillInstrs) {
1343     MachineBasicBlock *MBB = MI->getParent();
1344     MachineInstr *SplitPoint = nullptr;
1345     switch (MI->getOpcode()) {
1346     case AMDGPU::SI_DEMOTE_I1:
1347     case AMDGPU::SI_KILL_I1_TERMINATOR:
1348       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1349       break;
1350     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1351       SplitPoint = lowerKillF32(*MBB, *MI);
1352       break;
1353     default:
1354       continue;
1355     }
1356     if (SplitPoint)
1357       splitBlock(MBB, SplitPoint);
1358   }
1359 }
1360 
1361 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1362   Instructions.clear();
1363   Blocks.clear();
1364   LiveMaskQueries.clear();
1365   LowerToCopyInstrs.clear();
1366   LowerToMovInstrs.clear();
1367   KillInstrs.clear();
1368   StateTransition.clear();
1369 
1370   ST = &MF.getSubtarget<GCNSubtarget>();
1371 
1372   TII = ST->getInstrInfo();
1373   TRI = &TII->getRegisterInfo();
1374   MRI = &MF.getRegInfo();
1375   LIS = &getAnalysis<LiveIntervals>();
1376   MDT = &getAnalysis<MachineDominatorTree>();
1377   PDT = &getAnalysis<MachinePostDominatorTree>();
1378 
1379   if (ST->isWave32()) {
1380     AndOpc = AMDGPU::S_AND_B32;
1381     AndN2Opc = AMDGPU::S_ANDN2_B32;
1382     XorOpc = AMDGPU::S_XOR_B32;
1383     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1384     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1385     WQMOpc = AMDGPU::S_WQM_B32;
1386     Exec = AMDGPU::EXEC_LO;
1387   } else {
1388     AndOpc = AMDGPU::S_AND_B64;
1389     AndN2Opc = AMDGPU::S_ANDN2_B64;
1390     XorOpc = AMDGPU::S_XOR_B64;
1391     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1392     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1393     WQMOpc = AMDGPU::S_WQM_B64;
1394     Exec = AMDGPU::EXEC;
1395   }
1396 
1397   const char GlobalFlags = analyzeFunction(MF);
1398   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1399 
1400   LiveMaskReg = Exec;
1401 
1402   // Shader is simple does not need WQM/WWM or any complex lowering
1403   if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() &&
1404       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1405     lowerLiveMaskQueries();
1406     return !LiveMaskQueries.empty();
1407   }
1408 
1409   MachineBasicBlock &Entry = MF.front();
1410   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1411 
1412   // Store a copy of the original live mask when required
1413   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1414     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1415     MachineInstr *MI =
1416         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1417             .addReg(Exec);
1418     LIS->InsertMachineInstrInMaps(*MI);
1419   }
1420 
1421   LLVM_DEBUG(printInfo());
1422 
1423   lowerLiveMaskQueries();
1424   lowerCopyInstrs();
1425 
1426   // Shader only needs WQM
1427   if (GlobalFlags == StateWQM) {
1428     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1429                   .addReg(Exec);
1430     LIS->InsertMachineInstrInMaps(*MI);
1431     lowerKillInstrs(true);
1432   } else {
1433     for (auto BII : Blocks)
1434       processBlock(*BII.first, BII.first == &Entry);
1435     // Lowering blocks causes block splitting so perform as a second pass.
1436     for (auto BII : Blocks)
1437       lowerBlock(*BII.first);
1438   }
1439 
1440   // Compute live range for live mask
1441   if (LiveMaskReg != Exec)
1442     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1443 
1444   // Physical registers like SCC aren't tracked by default anyway, so just
1445   // removing the ranges we computed is the simplest option for maintaining
1446   // the analysis results.
1447   LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1448 
1449   // If we performed any kills then recompute EXEC
1450   if (!KillInstrs.empty())
1451     LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
1452 
1453   return true;
1454 }
1455