1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode for pixel
11 /// shaders, and whole wavefront mode for all programs.
12 ///
13 /// Whole quad mode is required for derivative computations, but it interferes
14 /// with shader side effects (stores and atomics). This pass is run on the
15 /// scheduled machine IR but before register coalescing, so that machine SSA is
16 /// available for analysis. It ensures that WQM is enabled when necessary, but
17 /// disabled around stores and atomics.
18 ///
19 /// When necessary, this pass creates a function prolog
20 ///
21 ///   S_MOV_B64 LiveMask, EXEC
22 ///   S_WQM_B64 EXEC, EXEC
23 ///
24 /// to enter WQM at the top of the function and surrounds blocks of Exact
25 /// instructions by
26 ///
27 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
28 ///   ...
29 ///   S_MOV_B64 EXEC, Tmp
30 ///
31 /// We also compute when a sequence of instructions requires Whole Wavefront
32 /// Mode (WWM) and insert instructions to save and restore it:
33 ///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
37 ///
38 /// In order to avoid excessive switching during sequences of Exact
39 /// instructions, the pass first analyzes which instructions must be run in WQM
40 /// (aka which instructions produce values that lead to derivative
41 /// computations).
42 ///
43 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
44 ///
45 /// There is room for improvement given better control flow analysis:
46 ///
47 ///  (1) at the top level (outside of control flow statements, and as long as
48 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
49 ///      the LiveMask (this is implemented for the entry block).
50 ///
51 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
52 ///      consist of exact and don't-care instructions, the switch only has to
53 ///      be done at the entry and exit points rather than potentially in each
54 ///      block of the region.
55 ///
56 //===----------------------------------------------------------------------===//
57 
58 #include "AMDGPU.h"
59 #include "AMDGPUSubtarget.h"
60 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
61 #include "SIInstrInfo.h"
62 #include "SIMachineFunctionInfo.h"
63 #include "llvm/ADT/DenseMap.h"
64 #include "llvm/ADT/MapVector.h"
65 #include "llvm/ADT/PostOrderIterator.h"
66 #include "llvm/ADT/SmallVector.h"
67 #include "llvm/ADT/StringRef.h"
68 #include "llvm/CodeGen/LiveInterval.h"
69 #include "llvm/CodeGen/LiveIntervals.h"
70 #include "llvm/CodeGen/MachineBasicBlock.h"
71 #include "llvm/CodeGen/MachineFunction.h"
72 #include "llvm/CodeGen/MachineFunctionPass.h"
73 #include "llvm/CodeGen/MachineInstr.h"
74 #include "llvm/CodeGen/MachineInstrBuilder.h"
75 #include "llvm/CodeGen/MachineOperand.h"
76 #include "llvm/CodeGen/MachineRegisterInfo.h"
77 #include "llvm/CodeGen/SlotIndexes.h"
78 #include "llvm/CodeGen/TargetRegisterInfo.h"
79 #include "llvm/IR/CallingConv.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/InitializePasses.h"
82 #include "llvm/MC/MCRegisterInfo.h"
83 #include "llvm/Pass.h"
84 #include "llvm/Support/Debug.h"
85 #include "llvm/Support/raw_ostream.h"
86 #include <cassert>
87 #include <vector>
88 
89 using namespace llvm;
90 
91 #define DEBUG_TYPE "si-wqm"
92 
93 namespace {
94 
95 enum {
96   StateWQM = 0x1,
97   StateWWM = 0x2,
98   StateExact = 0x4,
99 };
100 
101 struct PrintState {
102 public:
103   int State;
104 
105   explicit PrintState(int State) : State(State) {}
106 };
107 
108 #ifndef NDEBUG
109 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
110   if (PS.State & StateWQM)
111     OS << "WQM";
112   if (PS.State & StateWWM) {
113     if (PS.State & StateWQM)
114       OS << '|';
115     OS << "WWM";
116   }
117   if (PS.State & StateExact) {
118     if (PS.State & (StateWQM | StateWWM))
119       OS << '|';
120     OS << "Exact";
121   }
122 
123   return OS;
124 }
125 #endif
126 
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131 };
132 
133 struct BlockInfo {
134   char Needs = 0;
135   char InNeeds = 0;
136   char OutNeeds = 0;
137 };
138 
139 struct WorkItem {
140   MachineBasicBlock *MBB = nullptr;
141   MachineInstr *MI = nullptr;
142 
143   WorkItem() = default;
144   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
145   WorkItem(MachineInstr *MI) : MI(MI) {}
146 };
147 
148 class SIWholeQuadMode : public MachineFunctionPass {
149 private:
150   CallingConv::ID CallingConv;
151   const SIInstrInfo *TII;
152   const SIRegisterInfo *TRI;
153   const GCNSubtarget *ST;
154   MachineRegisterInfo *MRI;
155   LiveIntervals *LIS;
156 
157   unsigned AndOpc;
158   unsigned XorTermrOpc;
159   unsigned OrSaveExecOpc;
160   unsigned Exec;
161 
162   DenseMap<const MachineInstr *, InstrInfo> Instructions;
163   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
164   SmallVector<MachineInstr *, 1> LiveMaskQueries;
165   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
166   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
167 
168   void printInfo();
169 
170   void markInstruction(MachineInstr &MI, char Flag,
171                        std::vector<WorkItem> &Worklist);
172   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
173                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
174   void markInstructionUses(const MachineInstr &MI, char Flag,
175                            std::vector<WorkItem> &Worklist);
176   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
177   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
178   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
179   char analyzeFunction(MachineFunction &MF);
180 
181   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
182                                       MachineBasicBlock::iterator Before);
183   MachineBasicBlock::iterator
184   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
185                    MachineBasicBlock::iterator Last, bool PreferLast,
186                    bool SaveSCC);
187   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
188                unsigned SaveWQM, unsigned LiveMaskReg);
189   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
190              unsigned SavedWQM);
191   void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
192              unsigned SaveOrig);
193   void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
194                unsigned SavedOrig);
195   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
196 
197   void lowerLiveMaskQueries(unsigned LiveMaskReg);
198   void lowerCopyInstrs();
199 
200 public:
201   static char ID;
202 
203   SIWholeQuadMode() :
204     MachineFunctionPass(ID) { }
205 
206   bool runOnMachineFunction(MachineFunction &MF) override;
207 
208   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
209 
210   void getAnalysisUsage(AnalysisUsage &AU) const override {
211     AU.addRequired<LiveIntervals>();
212     AU.addPreserved<SlotIndexes>();
213     AU.addPreserved<LiveIntervals>();
214     AU.setPreservesCFG();
215     MachineFunctionPass::getAnalysisUsage(AU);
216   }
217 };
218 
219 } // end anonymous namespace
220 
221 char SIWholeQuadMode::ID = 0;
222 
223 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
224                       false)
225 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
226 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
227                     false)
228 
229 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
230 
231 FunctionPass *llvm::createSIWholeQuadModePass() {
232   return new SIWholeQuadMode;
233 }
234 
235 #ifndef NDEBUG
236 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
237   for (const auto &BII : Blocks) {
238     dbgs() << "\n"
239            << printMBBReference(*BII.first) << ":\n"
240            << "  InNeeds = " << PrintState(BII.second.InNeeds)
241            << ", Needs = " << PrintState(BII.second.Needs)
242            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
243 
244     for (const MachineInstr &MI : *BII.first) {
245       auto III = Instructions.find(&MI);
246       if (III == Instructions.end())
247         continue;
248 
249       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
250              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
251     }
252   }
253 }
254 #endif
255 
256 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
257                                       std::vector<WorkItem> &Worklist) {
258   InstrInfo &II = Instructions[&MI];
259 
260   assert(!(Flag & StateExact) && Flag != 0);
261 
262   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
263 
264   // Remove any disabled states from the flag. The user that required it gets
265   // an undefined value in the helper lanes. For example, this can happen if
266   // the result of an atomic is used by instruction that requires WQM, where
267   // ignoring the request for WQM is correct as per the relevant specs.
268   Flag &= ~II.Disabled;
269 
270   // Ignore if the flag is already encompassed by the existing needs, or we
271   // just disabled everything.
272   if ((II.Needs & Flag) == Flag)
273     return;
274 
275   II.Needs |= Flag;
276   Worklist.push_back(&MI);
277 }
278 
279 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
280 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
281                                Register Reg, unsigned SubReg, char Flag,
282                                std::vector<WorkItem> &Worklist) {
283   assert(!MRI->isSSA());
284 
285   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
286 
287   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
288   if (!UseLRQ.valueIn())
289     return;
290 
291   SmallPtrSet<const VNInfo *, 4> Visited;
292   SmallVector<const VNInfo *, 4> ToProcess;
293   ToProcess.push_back(UseLRQ.valueIn());
294   do {
295     const VNInfo *Value = ToProcess.pop_back_val();
296     Visited.insert(Value);
297 
298     if (Value->isPHIDef()) {
299       // Need to mark all defs used in the PHI node
300       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
301       assert(MBB && "Phi-def has no defining MBB");
302       for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
303                                                   PE = MBB->pred_end();
304            PI != PE; ++PI) {
305         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
306           if (!Visited.count(VN))
307             ToProcess.push_back(VN);
308         }
309       }
310     } else {
311       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
312       assert(MI && "Def has no defining instruction");
313       markInstruction(*MI, Flag, Worklist);
314 
315       // Iterate over all operands to find relevant definitions
316       for (const MachineOperand &Op : MI->operands()) {
317         if (!(Op.isReg() && Op.getReg() == Reg))
318           continue;
319 
320         // Does this def cover whole register?
321         bool DefinesFullReg =
322             Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
323         if (!DefinesFullReg) {
324           // Partial definition; need to follow and mark input value
325           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
326           if (const VNInfo *VN = LRQ.valueIn()) {
327             if (!Visited.count(VN))
328               ToProcess.push_back(VN);
329           }
330         }
331       }
332     }
333   } while (!ToProcess.empty());
334 }
335 
336 /// Mark all instructions defining the uses in \p MI with \p Flag.
337 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
338                                           std::vector<WorkItem> &Worklist) {
339 
340   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
341                     << MI);
342 
343   for (const MachineOperand &Use : MI.uses()) {
344     if (!Use.isReg() || !Use.isUse())
345       continue;
346 
347     Register Reg = Use.getReg();
348 
349     // Handle physical registers that we need to track; this is mostly relevant
350     // for VCC, which can appear as the (implicit) input of a uniform branch,
351     // e.g. when a loop counter is stored in a VGPR.
352     if (!Reg.isVirtual()) {
353       if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
354         continue;
355 
356       for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
357         LiveRange &LR = LIS->getRegUnit(*RegUnit);
358         const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
359         if (!Value)
360           continue;
361 
362         if (MRI->isSSA()) {
363           // Since we're in machine SSA, we do not need to track physical
364           // registers across basic blocks.
365           if (Value->isPHIDef())
366             continue;
367           markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
368                           Worklist);
369         } else {
370           markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
371         }
372       }
373 
374       continue;
375     }
376 
377     if (MRI->isSSA()) {
378       for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
379         markInstruction(DefMI, Flag, Worklist);
380     } else {
381       LiveRange &LR = LIS->getInterval(Reg);
382       markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
383     }
384   }
385 }
386 
387 // Scan instructions to determine which ones require an Exact execmask and
388 // which ones seed WQM requirements.
389 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
390                                        std::vector<WorkItem> &Worklist) {
391   char GlobalFlags = 0;
392   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
393   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
394   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
395 
396   // We need to visit the basic blocks in reverse post-order so that we visit
397   // defs before uses, in particular so that we don't accidentally mark an
398   // instruction as needing e.g. WQM before visiting it and realizing it needs
399   // WQM disabled.
400   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
401   for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
402     MachineBasicBlock &MBB = **BI;
403     BlockInfo &BBI = Blocks[&MBB];
404 
405     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
406       MachineInstr &MI = *II;
407       InstrInfo &III = Instructions[&MI];
408       unsigned Opcode = MI.getOpcode();
409       char Flags = 0;
410 
411       if (TII->isWQM(Opcode)) {
412         // Sampling instructions don't need to produce results for all pixels
413         // in a quad, they just require all inputs of a quad to have been
414         // computed for derivatives.
415         markInstructionUses(MI, StateWQM, Worklist);
416         GlobalFlags |= StateWQM;
417         continue;
418       } else if (Opcode == AMDGPU::WQM) {
419         // The WQM intrinsic requires its output to have all the helper lanes
420         // correct, so we need it to be in WQM.
421         Flags = StateWQM;
422         LowerToCopyInstrs.push_back(&MI);
423       } else if (Opcode == AMDGPU::SOFT_WQM) {
424         LowerToCopyInstrs.push_back(&MI);
425         SoftWQMInstrs.push_back(&MI);
426         continue;
427       } else if (Opcode == AMDGPU::WWM) {
428         // The WWM intrinsic doesn't make the same guarantee, and plus it needs
429         // to be executed in WQM or Exact so that its copy doesn't clobber
430         // inactive lanes.
431         markInstructionUses(MI, StateWWM, Worklist);
432         GlobalFlags |= StateWWM;
433         LowerToMovInstrs.push_back(&MI);
434         continue;
435       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
436                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
437         III.Disabled = StateWWM;
438         MachineOperand &Inactive = MI.getOperand(2);
439         if (Inactive.isReg()) {
440           if (Inactive.isUndef()) {
441             LowerToCopyInstrs.push_back(&MI);
442           } else {
443             Register Reg = Inactive.getReg();
444             if (Reg.isVirtual()) {
445               for (MachineInstr &DefMI : MRI->def_instructions(Reg))
446                 markInstruction(DefMI, StateWWM, Worklist);
447             }
448           }
449         }
450         SetInactiveInstrs.push_back(&MI);
451         continue;
452       } else if (TII->isDisableWQM(MI)) {
453         BBI.Needs |= StateExact;
454         if (!(BBI.InNeeds & StateExact)) {
455           BBI.InNeeds |= StateExact;
456           Worklist.push_back(&MBB);
457         }
458         GlobalFlags |= StateExact;
459         III.Disabled = StateWQM | StateWWM;
460         continue;
461       } else {
462         if (Opcode == AMDGPU::SI_PS_LIVE) {
463           LiveMaskQueries.push_back(&MI);
464         } else if (WQMOutputs) {
465           // The function is in machine SSA form, which means that physical
466           // VGPRs correspond to shader inputs and outputs. Inputs are
467           // only used, outputs are only defined.
468           for (const MachineOperand &MO : MI.defs()) {
469             if (!MO.isReg())
470               continue;
471 
472             Register Reg = MO.getReg();
473 
474             if (!Reg.isVirtual() &&
475                 TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
476               Flags = StateWQM;
477               break;
478             }
479           }
480         }
481 
482         if (!Flags)
483           continue;
484       }
485 
486       markInstruction(MI, Flags, Worklist);
487       GlobalFlags |= Flags;
488     }
489   }
490 
491   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
492   // ever used anywhere in the function. This implements the corresponding
493   // semantics of @llvm.amdgcn.set.inactive.
494   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
495   if (GlobalFlags & StateWQM) {
496     for (MachineInstr *MI : SetInactiveInstrs)
497       markInstruction(*MI, StateWQM, Worklist);
498     for (MachineInstr *MI : SoftWQMInstrs)
499       markInstruction(*MI, StateWQM, Worklist);
500   }
501 
502   return GlobalFlags;
503 }
504 
505 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
506                                            std::vector<WorkItem>& Worklist) {
507   MachineBasicBlock *MBB = MI.getParent();
508   InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
509   BlockInfo &BI = Blocks[MBB];
510 
511   // Control flow-type instructions and stores to temporary memory that are
512   // followed by WQM computations must themselves be in WQM.
513   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
514       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
515     Instructions[&MI].Needs = StateWQM;
516     II.Needs = StateWQM;
517   }
518 
519   // Propagate to block level
520   if (II.Needs & StateWQM) {
521     BI.Needs |= StateWQM;
522     if (!(BI.InNeeds & StateWQM)) {
523       BI.InNeeds |= StateWQM;
524       Worklist.push_back(MBB);
525     }
526   }
527 
528   // Propagate backwards within block
529   if (MachineInstr *PrevMI = MI.getPrevNode()) {
530     char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
531     if (!PrevMI->isPHI()) {
532       InstrInfo &PrevII = Instructions[PrevMI];
533       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
534         PrevII.OutNeeds |= InNeeds;
535         Worklist.push_back(PrevMI);
536       }
537     }
538   }
539 
540   // Propagate WQM flag to instruction inputs
541   assert(!(II.Needs & StateExact));
542 
543   if (II.Needs != 0)
544     markInstructionUses(MI, II.Needs, Worklist);
545 
546   // Ensure we process a block containing WWM, even if it does not require any
547   // WQM transitions.
548   if (II.Needs & StateWWM)
549     BI.Needs |= StateWWM;
550 }
551 
552 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
553                                      std::vector<WorkItem>& Worklist) {
554   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
555 
556   // Propagate through instructions
557   if (!MBB.empty()) {
558     MachineInstr *LastMI = &*MBB.rbegin();
559     InstrInfo &LastII = Instructions[LastMI];
560     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
561       LastII.OutNeeds |= BI.OutNeeds;
562       Worklist.push_back(LastMI);
563     }
564   }
565 
566   // Predecessor blocks must provide for our WQM/Exact needs.
567   for (MachineBasicBlock *Pred : MBB.predecessors()) {
568     BlockInfo &PredBI = Blocks[Pred];
569     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
570       continue;
571 
572     PredBI.OutNeeds |= BI.InNeeds;
573     PredBI.InNeeds |= BI.InNeeds;
574     Worklist.push_back(Pred);
575   }
576 
577   // All successors must be prepared to accept the same set of WQM/Exact data.
578   for (MachineBasicBlock *Succ : MBB.successors()) {
579     BlockInfo &SuccBI = Blocks[Succ];
580     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
581       continue;
582 
583     SuccBI.InNeeds |= BI.OutNeeds;
584     Worklist.push_back(Succ);
585   }
586 }
587 
588 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
589   std::vector<WorkItem> Worklist;
590   char GlobalFlags = scanInstructions(MF, Worklist);
591 
592   while (!Worklist.empty()) {
593     WorkItem WI = Worklist.back();
594     Worklist.pop_back();
595 
596     if (WI.MI)
597       propagateInstruction(*WI.MI, Worklist);
598     else
599       propagateBlock(*WI.MBB, Worklist);
600   }
601 
602   return GlobalFlags;
603 }
604 
605 MachineBasicBlock::iterator
606 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
607                          MachineBasicBlock::iterator Before) {
608   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
609 
610   MachineInstr *Save =
611       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
612           .addReg(AMDGPU::SCC);
613   MachineInstr *Restore =
614       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
615           .addReg(SaveReg);
616 
617   LIS->InsertMachineInstrInMaps(*Save);
618   LIS->InsertMachineInstrInMaps(*Restore);
619   LIS->createAndComputeVirtRegInterval(SaveReg);
620 
621   return Restore;
622 }
623 
624 // Return an iterator in the (inclusive) range [First, Last] at which
625 // instructions can be safely inserted, keeping in mind that some of the
626 // instructions we want to add necessarily clobber SCC.
627 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
628     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
629     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
630   if (!SaveSCC)
631     return PreferLast ? Last : First;
632 
633   LiveRange &LR = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
634   auto MBBE = MBB.end();
635   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
636                                      : LIS->getMBBEndIdx(&MBB);
637   SlotIndex LastIdx =
638       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
639   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
640   const LiveRange::Segment *S;
641 
642   for (;;) {
643     S = LR.getSegmentContaining(Idx);
644     if (!S)
645       break;
646 
647     if (PreferLast) {
648       SlotIndex Next = S->start.getBaseIndex();
649       if (Next < FirstIdx)
650         break;
651       Idx = Next;
652     } else {
653       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
654       assert(EndMI && "Segment does not end on valid instruction");
655       auto NextI = std::next(EndMI->getIterator());
656       SlotIndex Next = LIS->getInstructionIndex(*NextI);
657       if (Next > LastIdx)
658         break;
659       Idx = Next;
660     }
661   }
662 
663   MachineBasicBlock::iterator MBBI;
664 
665   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
666     MBBI = MI;
667   else {
668     assert(Idx == LIS->getMBBEndIdx(&MBB));
669     MBBI = MBB.end();
670   }
671 
672   // Move insertion point past any operations modifying EXEC.
673   // This assumes that the value of SCC defined by any of these operations
674   // does not need to be preserved.
675   while (MBBI != Last) {
676     bool IsExecDef = false;
677     for (const MachineOperand &MO : MBBI->operands()) {
678       if (MO.isReg() && MO.isDef()) {
679         IsExecDef |=
680             MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
681       }
682     }
683     if (!IsExecDef)
684       break;
685     MBBI++;
686     S = nullptr;
687   }
688 
689   if (S)
690     MBBI = saveSCC(MBB, MBBI);
691 
692   return MBBI;
693 }
694 
695 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
696                               MachineBasicBlock::iterator Before,
697                               unsigned SaveWQM, unsigned LiveMaskReg) {
698   MachineInstr *MI;
699 
700   if (SaveWQM) {
701     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
702                    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64),
703                  SaveWQM)
704              .addReg(LiveMaskReg);
705   } else {
706     unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
707     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
708                    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64),
709                  Exec)
710              .addReg(Exec)
711              .addReg(LiveMaskReg);
712   }
713 
714   LIS->InsertMachineInstrInMaps(*MI);
715 }
716 
717 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
718                             MachineBasicBlock::iterator Before,
719                             unsigned SavedWQM) {
720   MachineInstr *MI;
721 
722   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
723   if (SavedWQM) {
724     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
725              .addReg(SavedWQM);
726   } else {
727     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(ST->isWave32() ?
728                    AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
729                  Exec)
730              .addReg(Exec);
731   }
732 
733   LIS->InsertMachineInstrInMaps(*MI);
734 }
735 
736 void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
737                             MachineBasicBlock::iterator Before,
738                             unsigned SaveOrig) {
739   MachineInstr *MI;
740 
741   assert(SaveOrig);
742   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
743            .addImm(-1);
744   LIS->InsertMachineInstrInMaps(*MI);
745 }
746 
747 void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
748                               MachineBasicBlock::iterator Before,
749                               unsigned SavedOrig) {
750   MachineInstr *MI;
751 
752   assert(SavedOrig);
753   MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
754                ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
755            .addReg(SavedOrig);
756   LIS->InsertMachineInstrInMaps(*MI);
757 }
758 
759 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
760                                    bool isEntry) {
761   auto BII = Blocks.find(&MBB);
762   if (BII == Blocks.end())
763     return;
764 
765   const BlockInfo &BI = BII->second;
766 
767   // This is a non-entry block that is WQM throughout, so no need to do
768   // anything.
769   if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
770     return;
771 
772   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
773                     << ":\n");
774 
775   unsigned SavedWQMReg = 0;
776   unsigned SavedNonWWMReg = 0;
777   bool WQMFromExec = isEntry;
778   char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
779   char NonWWMState = 0;
780   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
781 
782   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
783   if (isEntry) {
784     // Skip the instruction that saves LiveMask
785     if (II != IE && II->getOpcode() == AMDGPU::COPY)
786       ++II;
787   }
788 
789   // This stores the first instruction where it's safe to switch from WQM to
790   // Exact or vice versa.
791   MachineBasicBlock::iterator FirstWQM = IE;
792 
793   // This stores the first instruction where it's safe to switch from WWM to
794   // Exact/WQM or to switch to WWM. It must always be the same as, or after,
795   // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
796   // switch to/from WQM as well.
797   MachineBasicBlock::iterator FirstWWM = IE;
798 
799   for (;;) {
800     MachineBasicBlock::iterator Next = II;
801     char Needs = StateExact | StateWQM; // WWM is disabled by default
802     char OutNeeds = 0;
803 
804     if (FirstWQM == IE)
805       FirstWQM = II;
806 
807     if (FirstWWM == IE)
808       FirstWWM = II;
809 
810     // First, figure out the allowed states (Needs) based on the propagated
811     // flags.
812     if (II != IE) {
813       MachineInstr &MI = *II;
814 
815       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
816         auto III = Instructions.find(&MI);
817         if (III != Instructions.end()) {
818           if (III->second.Needs & StateWWM)
819             Needs = StateWWM;
820           else if (III->second.Needs & StateWQM)
821             Needs = StateWQM;
822           else
823             Needs &= ~III->second.Disabled;
824           OutNeeds = III->second.OutNeeds;
825         }
826       } else {
827         // If the instruction doesn't actually need a correct EXEC, then we can
828         // safely leave WWM enabled.
829         Needs = StateExact | StateWQM | StateWWM;
830       }
831 
832       if (MI.isTerminator() && OutNeeds == StateExact)
833         Needs = StateExact;
834 
835       ++Next;
836     } else {
837       // End of basic block
838       if (BI.OutNeeds & StateWQM)
839         Needs = StateWQM;
840       else if (BI.OutNeeds == StateExact)
841         Needs = StateExact;
842       else
843         Needs = StateWQM | StateExact;
844     }
845 
846     // Now, transition if necessary.
847     if (!(Needs & State)) {
848       MachineBasicBlock::iterator First;
849       if (State == StateWWM || Needs == StateWWM) {
850         // We must switch to or from WWM
851         First = FirstWWM;
852       } else {
853         // We only need to switch to/from WQM, so we can use FirstWQM
854         First = FirstWQM;
855       }
856 
857       MachineBasicBlock::iterator Before =
858           prepareInsertion(MBB, First, II, Needs == StateWQM,
859                            Needs == StateExact || WQMFromExec);
860 
861       if (State == StateWWM) {
862         assert(SavedNonWWMReg);
863         fromWWM(MBB, Before, SavedNonWWMReg);
864         LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
865         SavedNonWWMReg = 0;
866         State = NonWWMState;
867       }
868 
869       if (Needs == StateWWM) {
870         NonWWMState = State;
871         assert(!SavedNonWWMReg);
872         SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
873         toWWM(MBB, Before, SavedNonWWMReg);
874         State = StateWWM;
875       } else {
876         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
877           if (!WQMFromExec && (OutNeeds & StateWQM)) {
878             assert(!SavedWQMReg);
879             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
880           }
881 
882           toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
883           State = StateExact;
884         } else if (State == StateExact && (Needs & StateWQM) &&
885                    !(Needs & StateExact)) {
886           assert(WQMFromExec == (SavedWQMReg == 0));
887 
888           toWQM(MBB, Before, SavedWQMReg);
889 
890           if (SavedWQMReg) {
891             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
892             SavedWQMReg = 0;
893           }
894           State = StateWQM;
895         } else {
896           // We can get here if we transitioned from WWM to a non-WWM state that
897           // already matches our needs, but we shouldn't need to do anything.
898           assert(Needs & State);
899         }
900       }
901     }
902 
903     if (Needs != (StateExact | StateWQM | StateWWM)) {
904       if (Needs != (StateExact | StateWQM))
905         FirstWQM = IE;
906       FirstWWM = IE;
907     }
908 
909     if (II == IE)
910       break;
911 
912     II = Next;
913   }
914   assert(!SavedWQMReg);
915   assert(!SavedNonWWMReg);
916 }
917 
918 void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
919   for (MachineInstr *MI : LiveMaskQueries) {
920     const DebugLoc &DL = MI->getDebugLoc();
921     Register Dest = MI->getOperand(0).getReg();
922 
923     MachineInstr *Copy =
924         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
925             .addReg(LiveMaskReg);
926 
927     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
928     MI->eraseFromParent();
929   }
930 }
931 
// Rewrite the copy-like instructions collected earlier in the pass into their
// final form: entries in LowerToMovInstrs become real mov instructions (or
// simplified SGPR COPYs), and entries in LowerToCopyInstrs become plain COPYs.
void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      // VGPR destination: pick the mov opcode matching the (sub)register
      // class and turn this instruction into that mov.
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else if (!MRI->isSSA()) {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        // Changing the early-clobber flag invalidates the cached live
        // interval for Reg, so drop it and recompute after the change.
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      // Strip every implicit EXEC use; findRegisterUseOperandIdx is re-run
      // after each removal because operand indices shift.
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->RemoveOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // the only reason we should be here is V_SET_INACTIVE has
      // an undef input so it is being replaced by a simple copy.
      // There should be a second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      // Operand 1 is tied to the (removed) inactive-lanes input; untie it so
      // the instruction is a valid two-operand COPY.
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}
985 
// Pass entry point: analyze which instructions need WQM/WWM/Exact, take fast
// paths for functions that need no mode switching (or only WQM), and
// otherwise insert the state transitions block by block.
bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  // Reset per-function state left over from a previous run of the pass.
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  // Select the wave-size-specific opcodes and EXEC register once up front.
  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    // No WQM anywhere: EXEC already is the live mask, so queries can read it
    // directly. If there is also no WWM and nothing to lower, we are done.
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      // Save EXEC before it is widened to WQM, so Exact regions and live-mask
      // queries can recover the original live mask.
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
                        TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                                : AMDGPU::S_WQM_B64),
                        Exec)
                    .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  // LiveMaskReg is nonzero only if the copy-from-EXEC above was created;
  // compute its live interval now that all uses have been inserted.
  if (LiveMaskReg)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

  return true;
}
1067