1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode for pixel
11 /// shaders, and whole wavefront mode for all programs.
12 ///
13 /// Whole quad mode is required for derivative computations, but it interferes
14 /// with shader side effects (stores and atomics). This pass is run on the
15 /// scheduled machine IR but before register coalescing, so that machine SSA is
16 /// available for analysis. It ensures that WQM is enabled when necessary, but
17 /// disabled around stores and atomics.
18 ///
19 /// When necessary, this pass creates a function prolog
20 ///
21 ///   S_MOV_B64 LiveMask, EXEC
22 ///   S_WQM_B64 EXEC, EXEC
23 ///
24 /// to enter WQM at the top of the function and surrounds blocks of Exact
25 /// instructions by
26 ///
27 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
28 ///   ...
29 ///   S_MOV_B64 EXEC, Tmp
30 ///
31 /// We also compute when a sequence of instructions requires Whole Wavefront
32 /// Mode (WWM) and insert instructions to save and restore it:
33 ///
34 /// S_OR_SAVEEXEC_B64 Tmp, -1
35 /// ...
36 /// S_MOV_B64 EXEC, Tmp
37 ///
38 /// In order to avoid excessive switching during sequences of Exact
39 /// instructions, the pass first analyzes which instructions must be run in WQM
40 /// (aka which instructions produce values that lead to derivative
41 /// computations).
42 ///
43 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
44 ///
45 /// There is room for improvement given better control flow analysis:
46 ///
47 ///  (1) at the top level (outside of control flow statements, and as long as
48 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
49 ///      the LiveMask (this is implemented for the entry block).
50 ///
51 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
52 ///      consist of exact and don't-care instructions, the switch only has to
53 ///      be done at the entry and exit points rather than potentially in each
54 ///      block of the region.
55 ///
56 //===----------------------------------------------------------------------===//
57 
58 #include "AMDGPU.h"
59 #include "AMDGPUSubtarget.h"
60 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
61 #include "SIInstrInfo.h"
62 #include "SIMachineFunctionInfo.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/MapVector.h"
65 #include "llvm/ADT/PostOrderIterator.h"
66 #include "llvm/ADT/SmallVector.h"
67 #include "llvm/ADT/StringRef.h"
68 #include "llvm/CodeGen/LiveInterval.h"
69 #include "llvm/CodeGen/LiveIntervals.h"
70 #include "llvm/CodeGen/MachineBasicBlock.h"
71 #include "llvm/CodeGen/MachineFunction.h"
72 #include "llvm/CodeGen/MachineFunctionPass.h"
73 #include "llvm/CodeGen/MachineInstr.h"
74 #include "llvm/CodeGen/MachineInstrBuilder.h"
75 #include "llvm/CodeGen/MachineOperand.h"
76 #include "llvm/CodeGen/MachineRegisterInfo.h"
77 #include "llvm/CodeGen/SlotIndexes.h"
78 #include "llvm/CodeGen/TargetRegisterInfo.h"
79 #include "llvm/IR/CallingConv.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/InitializePasses.h"
82 #include "llvm/MC/MCRegisterInfo.h"
83 #include "llvm/Pass.h"
84 #include "llvm/Support/Debug.h"
85 #include "llvm/Support/raw_ostream.h"
86 #include <cassert>
87 #include <vector>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "si-wqm"
92 
93 namespace {
94 
// Bitmask of the execution-mask states an instruction/block may need or
// forbid. Values are disjoint bits so they can be OR'd together.
enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};
100 
// Small wrapper so a state bitmask can be streamed to a raw_ostream via the
// operator<< defined below (debug builds only).
struct PrintState {
  int State;

  explicit PrintState(int State) : State(State) {}
};
107 
108 #ifndef NDEBUG
109 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
110   if (PS.State & StateWQM)
111     OS << "WQM";
112   if (PS.State & StateWWM) {
113     if (PS.State & StateWQM)
114       OS << '|';
115     OS << "WWM";
116   }
117   if (PS.State & StateExact) {
118     if (PS.State & (StateWQM | StateWWM))
119       OS << '|';
120     OS << "Exact";
121   }
122 
123   return OS;
124 }
125 #endif
126 
// Per-instruction analysis results.
struct InstrInfo {
  // State bits this instruction must execute in (never StateExact; see
  // markInstruction's assert).
  char Needs = 0;
  // State bits that must not be requested for this instruction; masked out
  // of any incoming Needs in markInstruction.
  char Disabled = 0;
  // State bits required immediately after this instruction (propagated
  // backwards by propagateInstruction).
  char OutNeeds = 0;
};
132 
// Per-basic-block analysis results.
struct BlockInfo {
  // State bits needed somewhere inside the block.
  char Needs = 0;
  // State bits that must be available on entry to the block.
  char InNeeds = 0;
  // State bits that must be available on exit from the block.
  char OutNeeds = 0;
};
138 
// Worklist element: exactly one of MBB/MI is non-null, distinguishing a
// block-level from an instruction-level propagation step.
struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};
147 
class SIWholeQuadMode : public MachineFunctionPass {
private:
  // Cached target/function state; presumably set up in runOnMachineFunction
  // (not visible in this chunk) — TODO confirm.
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  // Wave-size dependent opcodes/registers (B32 vs B64 variants); presumably
  // selected once per function — TODO confirm (initialization not visible
  // here).
  unsigned AndOpc;
  unsigned XorTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  // Analysis results, filled in by scanInstructions / the propagate* methods.
  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;
  // SI_PS_LIVE pseudos to be rewritten to read the live mask.
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  // WWM pseudos to be lowered to plain moves.
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  // WQM/SOFT_WQM/V_SET_INACTIVE pseudos to be lowered to copies.
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
218 
219 } // end anonymous namespace
220 
char SIWholeQuadMode::ID = 0;

// Register the pass and its LiveIntervals dependency with the pass registry.
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

// Factory used by the AMDGPU target pass pipeline.
FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}
234 
235 #ifndef NDEBUG
236 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
237   for (const auto &BII : Blocks) {
238     dbgs() << "\n"
239            << printMBBReference(*BII.first) << ":\n"
240            << "  InNeeds = " << PrintState(BII.second.InNeeds)
241            << ", Needs = " << PrintState(BII.second.Needs)
242            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
243 
244     for (const MachineInstr &MI : *BII.first) {
245       auto III = Instructions.find(&MI);
246       if (III == Instructions.end())
247         continue;
248 
249       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
250              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
251     }
252   }
253 }
254 #endif
255 
256 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
257                                       std::vector<WorkItem> &Worklist) {
258   InstrInfo &II = Instructions[&MI];
259 
260   assert(!(Flag & StateExact) && Flag != 0);
261 
262   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
263 
264   // Remove any disabled states from the flag. The user that required it gets
265   // an undefined value in the helper lanes. For example, this can happen if
266   // the result of an atomic is used by instruction that requires WQM, where
267   // ignoring the request for WQM is correct as per the relevant specs.
268   Flag &= ~II.Disabled;
269 
270   // Ignore if the flag is already encompassed by the existing needs, or we
271   // just disabled everything.
272   if ((II.Needs & Flag) == Flag)
273     return;
274 
275   II.Needs |= Flag;
276   Worklist.push_back(&MI);
277 }
278 
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
///
/// Walks the value-number graph of \p LR backwards from the value live at
/// \p UseMI: phi-defs fan out to the values reaching from each predecessor,
/// and partial (subregister) definitions continue through the incoming value,
/// so that every instruction contributing to the used value is marked.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  // Only used in the non-SSA path; the SSA path in markInstructionUses walks
  // def_instructions instead.
  assert(!MRI->isSSA());

  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  if (!UseLRQ.valueIn())
    return;

  // Worklist of value numbers still to be traced; Visited prevents cycles
  // (e.g. through loop phis).
  SmallPtrSet<const VNInfo *, 4> Visited;
  SmallVector<const VNInfo *, 4> ToProcess;
  ToProcess.push_back(UseLRQ.valueIn());
  do {
    const VNInfo *Value = ToProcess.pop_back_val();
    Visited.insert(Value);

    if (Value->isPHIDef()) {
      // Need to mark all defs used in the PHI node
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
                                                  PE = MBB->pred_end();
           PI != PE; ++PI) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VN))
            ToProcess.push_back(VN);
        }
      }
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");
      markInstruction(*MI, Flag, Worklist);

      // Iterate over all operands to find relevant definitions
      for (const MachineOperand &Op : MI->operands()) {
        if (!(Op.isReg() && Op.getReg() == Reg))
          continue;

        // Does this def cover whole register?
        bool DefinesFullReg =
            Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
        if (!DefinesFullReg) {
          // Partial definition; need to follow and mark input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VN))
              ToProcess.push_back(VN);
          }
        }
      }
    }
  } while (!ToProcess.empty());
}
335 
/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {

  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Reg.isVirtual()) {
      // EXEC itself is never a data dependency for this analysis.
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      // Physical registers are tracked per register unit so that aliasing
      // registers (e.g. VCC vs VCC_LO) are handled uniformly.
      for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
           ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        if (MRI->isSSA()) {
          // Since we're in machine SSA, we do not need to track physical
          // registers across basic blocks.
          if (Value->isPHIDef())
            continue;
          markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                          Worklist);
        } else {
          markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
        }
      }

      continue;
    }

    if (MRI->isSSA()) {
      // In SSA each virtual register has a unique (possibly multi-operand)
      // set of defining instructions; mark them all.
      for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
        markInstruction(DefMI, Flag, Worklist);
    } else {
      // Outside SSA, follow the live range to the reaching definitions.
      LiveRange &LR = LIS->getInterval(Reg);
      markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
    }
  }
}
387 
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements. Returns the union of all state flags
// seen anywhere in the function.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        // SOFT_WQM only needs WQM if WQM is used elsewhere in the function;
        // collect it for the fixup loop below.
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and in addition
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        // SET_INACTIVE itself must not run in WWM; its "inactive lanes" input
        // (operand 2), if it is a defined virtual register, must be computed
        // in WWM.
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            Register Reg = Inactive.getReg();
            if (Reg.isVirtual()) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        // Stores/atomics etc.: force Exact for this block and forbid WQM/WWM
        // for the instruction itself.
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
505 
// Propagate the state requirements of a single instruction: upgrade it to
// WQM if needed, push its needs to the enclosing block, and propagate
// OutNeeds backwards to the previous instruction and WQM to its inputs.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  // Take a copy: markInstruction below may grow the Instructions map and
  // invalidate references into it.
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the map entry and the local copy.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    // WWM is not carried backwards as an "in" requirement; only WQM is.
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}
552 
553 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
554                                      std::vector<WorkItem>& Worklist) {
555   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
556 
557   // Propagate through instructions
558   if (!MBB.empty()) {
559     MachineInstr *LastMI = &*MBB.rbegin();
560     InstrInfo &LastII = Instructions[LastMI];
561     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
562       LastII.OutNeeds |= BI.OutNeeds;
563       Worklist.push_back(LastMI);
564     }
565   }
566 
567   // Predecessor blocks must provide for our WQM/Exact needs.
568   for (MachineBasicBlock *Pred : MBB.predecessors()) {
569     BlockInfo &PredBI = Blocks[Pred];
570     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
571       continue;
572 
573     PredBI.OutNeeds |= BI.InNeeds;
574     PredBI.InNeeds |= BI.InNeeds;
575     Worklist.push_back(Pred);
576   }
577 
578   // All successors must be prepared to accept the same set of WQM/Exact data.
579   for (MachineBasicBlock *Succ : MBB.successors()) {
580     BlockInfo &SuccBI = Blocks[Succ];
581     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
582       continue;
583 
584     SuccBI.InNeeds |= BI.OutNeeds;
585     Worklist.push_back(Succ);
586   }
587 }
588 
589 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
590   std::vector<WorkItem> Worklist;
591   char GlobalFlags = scanInstructions(MF, Worklist);
592 
593   while (!Worklist.empty()) {
594     WorkItem WI = Worklist.back();
595     Worklist.pop_back();
596 
597     if (WI.MI)
598       propagateInstruction(*WI.MI, Worklist);
599     else
600       propagateBlock(*WI.MBB, Worklist);
601   }
602 
603   return GlobalFlags;
604 }
605 
606 MachineBasicBlock::iterator
607 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
608                          MachineBasicBlock::iterator Before) {
609   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
610 
611   MachineInstr *Save =
612       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
613           .addReg(AMDGPU::SCC);
614   MachineInstr *Restore =
615       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
616           .addReg(SaveReg);
617 
618   LIS->InsertMachineInstrInMaps(*Save);
619   LIS->InsertMachineInstrInMaps(*Restore);
620   LIS->createAndComputeVirtRegInterval(SaveReg);
621 
622   return Restore;
623 }
624 
// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  // If nothing we insert clobbers SCC, any point in the range works.
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  // Walk out of any live segment of SCC: backwards towards First when
  // PreferLast, forwards towards Last otherwise. If we leave the range while
  // still inside a segment, S stays non-null and we fall back to saveSCC.
  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    // Skipping an EXEC def means SCC no longer needs an explicit save here.
    S = nullptr;
  }

  // Still inside a live segment of SCC: explicitly save/restore it around
  // the insertion point.
  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}
698 
699 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
700                               MachineBasicBlock::iterator Before,
701                               unsigned SaveWQM, unsigned LiveMaskReg) {
702   MachineInstr *MI;
703 
704   if (SaveWQM) {
705     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
706                    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
707                  SaveWQM)
708              .addReg(LiveMaskReg);
709   } else {
710     unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
711     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
712                    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
713                  Exec)
714              .addReg(Exec)
715              .addReg(LiveMaskReg);
716   }
717 
718   LIS->InsertMachineInstrInMaps(*MI);
719 }
720 
721 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
722                             MachineBasicBlock::iterator Before,
723                             unsigned SavedWQM) {
724   MachineInstr *MI;
725 
726   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
727   if (SavedWQM) {
728     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
729              .addReg(SavedWQM);
730   } else {
731     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
732                    AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
733                  Exec)
734              .addReg(Exec);
735   }
736 
737   LIS->InsertMachineInstrInMaps(*MI);
738 }
739 
740 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
741                             MachineBasicBlock::iterator Before,
742                             unsigned SaveOrig) {
743   MachineInstr *MI;
744 
745   assert(SaveOrig);
746   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
747            .addImm(-1);
748   LIS->InsertMachineInstrInMaps(*MI);
749 }
750 
751 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
752                               MachineBasicBlock::iterator Before,
753                               unsigned SavedOrig) {
754   MachineInstr *MI;
755 
756   assert(SavedOrig);
757   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
758                ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
759            .addReg(SavedOrig);
760   LIS->InsertMachineInstrInMaps(*MI);
761 }
762 
// Walk one basic block and insert the state transitions (Exact/WQM/WWM)
// computed by the analysis, lazily: a transition is inserted at the earliest
// point (FirstWQM/FirstWWM) at which it would have been legal, just before
// the first instruction that actually requires the new state.
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  // In the entry block WQM can be recovered from exec itself, so no save
  // register is needed there.
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      // Exit a terminator in Exact when everything after it needs Exact.
      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      // Leaving WWM always goes back to the state that was active before it.
      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
        SavedNonWWMReg = 0;
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        assert(!SavedNonWWMReg);
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          // Only save WQM if it will be needed again later and cannot be
          // recomputed from exec.
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    // Reset the lazy insertion points once the current instruction has pinned
    // the state.
    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonWWMReg);
}
921 
922 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
923   for (MachineInstr *MI : LiveMaskQueries) {
924     const DebugLoc &DL = MI->getDebugLoc();
925     Register Dest = MI->getOperand(0).getReg();
926 
927     MachineInstr *Copy =
928         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
929             .addReg(LiveMaskReg);
930 
931     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
932     MI->eraseFromParent();
933   }
934 }
935 
// Rewrite the pseudo-instructions collected during analysis into real moves
// or plain COPYs now that WQM/WWM regions have been materialized.
void SIWholeQuadMode::lowerCopyInstrs() {
  // Instructions that must become EXEC-dependent VALU moves (or, for SGPR
  // destinations outside SSA, plain copies).
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      // Pick the mov opcode from the (possibly sub-register-narrowed)
      // destination register class.
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else if (!MRI->isSSA()) {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        // The live interval must be dropped before the flag is cleared and
        // recomputed afterwards so LiveIntervals stays consistent.
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      // Strip every implicit EXEC use; indexes shift after each removal, so
      // re-query from scratch each time.
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->RemoveOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  // Instructions that degenerate to plain COPYs.
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // the only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}
989 
990 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
991   Instructions.clear();
992   Blocks.clear();
993   LiveMaskQueries.clear();
994   LowerToCopyInstrs.clear();
995   LowerToMovInstrs.clear();
996   CallingConv = MF.getFunction().getCallingConv();
997 
998   ST = &MF.getSubtarget<GCNSubtarget>();
999 
1000   TII = ST->getInstrInfo();
1001   TRI = &TII->getRegisterInfo();
1002   MRI = &MF.getRegInfo();
1003   LIS = &getAnalysis<LiveIntervals>();
1004 
1005   if (ST->isWave32()) {
1006     AndOpc = AMDGPU::S_AND_B32;
1007     XorTermrOpc = AMDGPU::S_XOR_B32_term;
1008     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1009     Exec = AMDGPU::EXEC_LO;
1010   } else {
1011     AndOpc = AMDGPU::S_AND_B64;
1012     XorTermrOpc = AMDGPU::S_XOR_B64_term;
1013     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1014     Exec = AMDGPU::EXEC;
1015   }
1016 
1017   char GlobalFlags = analyzeFunction(MF);
1018   unsigned LiveMaskReg = 0;
1019   if (!(GlobalFlags & StateWQM)) {
1020     lowerLiveMaskQueries(Exec);
1021     if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
1022       return !LiveMaskQueries.empty();
1023   } else {
1024     // Store a copy of the original live mask when required
1025     MachineBasicBlock &Entry = MF.front();
1026     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1027 
1028     if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
1029       LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1030       MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
1031                                  TII->get(AMDGPU::COPY), LiveMaskReg)
1032                              .addReg(Exec);
1033       LIS->InsertMachineInstrInMaps(*MI);
1034     }
1035 
1036     lowerLiveMaskQueries(LiveMaskReg);
1037 
1038     if (GlobalFlags == StateWQM) {
1039       // For a shader that needs only WQM, we can just set it once.
1040       auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
1041                         TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
1042                                                 : AMDGPU::S_WQM_B64),
1043                         Exec)
1044                     .addReg(Exec);
1045       LIS->InsertMachineInstrInMaps(*MI);
1046 
1047       lowerCopyInstrs();
1048       // EntryMI may become invalid here
1049       return true;
1050     }
1051   }
1052 
1053   LLVM_DEBUG(printInfo());
1054 
1055   lowerCopyInstrs();
1056 
1057   // Handle the general case
1058   for (auto BII : Blocks)
1059     processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
1060 
1061   if (LiveMaskReg)
1062     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1063 
1064   // Physical registers like SCC aren't tracked by default anyway, so just
1065   // removing the ranges we computed is the simplest option for maintaining
1066   // the analysis results.
1067   LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1068 
1069   return true;
1070 }
1071