1 //===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AMDGPU.h"
10 #include "GCNSubtarget.h"
11 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
12 #include "SIRegisterInfo.h"
13 #include "llvm/CodeGen/LivePhysRegs.h"
14 #include "llvm/CodeGen/MachineFunctionPass.h"
15 #include "llvm/InitializePasses.h"
16 
17 using namespace llvm;
18 
19 #define DEBUG_TYPE "si-optimize-exec-masking"
20 
21 namespace {
22 
class SIOptimizeExecMasking : public MachineFunctionPass {
  // Per-function state cached in runOnMachineFunction() so the helpers below
  // don't need to thread these through every call.
  MachineFunction *MF = nullptr;
  const GCNSubtarget *ST = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  // If MI copies exec into a register, returns that register; invalid
  // otherwise.
  Register isCopyFromExec(const MachineInstr &MI) const;
  // If MI copies a register into exec, returns the source register; invalid
  // otherwise.
  Register isCopyToExec(const MachineInstr &MI) const;
  // Rewrites a *_term pseudo-terminator into its ordinary (non-terminator)
  // opcode. Returns true if MI was changed.
  bool removeTerminatorBit(MachineInstr &MI) const;
  // Rewrites all pseudo-terminators in MBB; returns a reverse iterator to the
  // first non-terminator instruction.
  MachineBasicBlock::reverse_iterator
  fixTerminators(MachineBasicBlock &MBB) const;
  // Scans backwards from I for a copy out of exec (bounded search).
  MachineBasicBlock::reverse_iterator
  findExecCopy(MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I,
               unsigned CopyToExec) const;

  // Liveness queries used by the v_cmp/v_cmpx rewrite, operating on the
  // half-open instruction range described in each definition's comment.
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
  // Bounded backwards scan from Origin for an instruction satisfying Pred,
  // aborting if any register in NonModifiableRegs is written first.
  MachineInstr *findInstrBackwards(MachineInstr &Origin,
                                   std::function<bool(MachineInstr *)> Pred,
                                   ArrayRef<MCRegister> NonModifiableRegs,
                                   unsigned MaxInstructions = 20) const;
  MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
                                                  MCRegister Exec) const;
  // The two top-level rewrites performed by this pass; see their definitions.
  bool optimizeExecSequence() const;
  bool optimizeVCmpxAndSaveexecSequence() const;
  bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                          MachineInstr &VCmp,
                                          MCRegister Exec) const;

public:
  static char ID;

  SIOptimizeExecMasking() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The pass only rewrites instructions inside blocks; the CFG is untouched.
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
73 
74 } // End anonymous namespace.
75 
// Register the pass with the LLVM pass registry under DEBUG_TYPE
// ("si-optimize-exec-masking").
INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
                      "SI optimize exec mask operations", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
                    "SI optimize exec mask operations", false, false)

char SIOptimizeExecMasking::ID = 0;

// Exported handle used by the AMDGPU target pass pipeline to refer to this
// pass by identity.
char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
85 
86 /// If \p MI is a copy from exec, return the register copied to.
87 Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
88   switch (MI.getOpcode()) {
89   case AMDGPU::COPY:
90   case AMDGPU::S_MOV_B64:
91   case AMDGPU::S_MOV_B64_term:
92   case AMDGPU::S_MOV_B32:
93   case AMDGPU::S_MOV_B32_term: {
94     const MachineOperand &Src = MI.getOperand(1);
95     if (Src.isReg() && Src.getReg() == TRI->getExec())
96       return MI.getOperand(0).getReg();
97   }
98   }
99 
100   return AMDGPU::NoRegister;
101 }
102 
103 /// If \p MI is a copy to exec, return the register copied from.
104 Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
105   switch (MI.getOpcode()) {
106   case AMDGPU::COPY:
107   case AMDGPU::S_MOV_B64:
108   case AMDGPU::S_MOV_B32: {
109     const MachineOperand &Dst = MI.getOperand(0);
110     if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
111         MI.getOperand(1).isReg())
112       return MI.getOperand(1).getReg();
113     break;
114   }
115   case AMDGPU::S_MOV_B64_term:
116   case AMDGPU::S_MOV_B32_term:
117     llvm_unreachable("should have been replaced");
118   }
119 
120   return Register();
121 }
122 
123 /// If \p MI is a logical operation on an exec value,
124 /// return the register copied to.
125 static Register isLogicalOpOnExec(const MachineInstr &MI) {
126   switch (MI.getOpcode()) {
127   case AMDGPU::S_AND_B64:
128   case AMDGPU::S_OR_B64:
129   case AMDGPU::S_XOR_B64:
130   case AMDGPU::S_ANDN2_B64:
131   case AMDGPU::S_ORN2_B64:
132   case AMDGPU::S_NAND_B64:
133   case AMDGPU::S_NOR_B64:
134   case AMDGPU::S_XNOR_B64: {
135     const MachineOperand &Src1 = MI.getOperand(1);
136     if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
137       return MI.getOperand(0).getReg();
138     const MachineOperand &Src2 = MI.getOperand(2);
139     if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
140       return MI.getOperand(0).getReg();
141     break;
142   }
143   case AMDGPU::S_AND_B32:
144   case AMDGPU::S_OR_B32:
145   case AMDGPU::S_XOR_B32:
146   case AMDGPU::S_ANDN2_B32:
147   case AMDGPU::S_ORN2_B32:
148   case AMDGPU::S_NAND_B32:
149   case AMDGPU::S_NOR_B32:
150   case AMDGPU::S_XNOR_B32: {
151     const MachineOperand &Src1 = MI.getOperand(1);
152     if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
153       return MI.getOperand(0).getReg();
154     const MachineOperand &Src2 = MI.getOperand(2);
155     if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
156       return MI.getOperand(0).getReg();
157     break;
158   }
159   }
160 
161   return AMDGPU::NoRegister;
162 }
163 
// Map a scalar logical opcode to its fused *_SAVEEXEC form (which writes the
// old exec value to the destination and the logical result to exec).
// Returns INSTRUCTION_LIST_END when no saveexec form exists for Opc.
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}
202 
203 // These are only terminators to get correct spill code placement during
204 // register allocation, so turn them back into normal instructions.
205 bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
206   switch (MI.getOpcode()) {
207   case AMDGPU::S_MOV_B32_term: {
208     bool RegSrc = MI.getOperand(1).isReg();
209     MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
210     return true;
211   }
212   case AMDGPU::S_MOV_B64_term: {
213     bool RegSrc = MI.getOperand(1).isReg();
214     MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
215     return true;
216   }
217   case AMDGPU::S_XOR_B64_term: {
218     // This is only a terminator to get the correct spill code placement during
219     // register allocation.
220     MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
221     return true;
222   }
223   case AMDGPU::S_XOR_B32_term: {
224     // This is only a terminator to get the correct spill code placement during
225     // register allocation.
226     MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
227     return true;
228   }
229   case AMDGPU::S_OR_B64_term: {
230     // This is only a terminator to get the correct spill code placement during
231     // register allocation.
232     MI.setDesc(TII->get(AMDGPU::S_OR_B64));
233     return true;
234   }
235   case AMDGPU::S_OR_B32_term: {
236     // This is only a terminator to get the correct spill code placement during
237     // register allocation.
238     MI.setDesc(TII->get(AMDGPU::S_OR_B32));
239     return true;
240   }
241   case AMDGPU::S_ANDN2_B64_term: {
242     // This is only a terminator to get the correct spill code placement during
243     // register allocation.
244     MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
245     return true;
246   }
247   case AMDGPU::S_ANDN2_B32_term: {
248     // This is only a terminator to get the correct spill code placement during
249     // register allocation.
250     MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
251     return true;
252   }
253   case AMDGPU::S_AND_B64_term: {
254     // This is only a terminator to get the correct spill code placement during
255     // register allocation.
256     MI.setDesc(TII->get(AMDGPU::S_AND_B64));
257     return true;
258   }
259   case AMDGPU::S_AND_B32_term: {
260     // This is only a terminator to get the correct spill code placement during
261     // register allocation.
262     MI.setDesc(TII->get(AMDGPU::S_AND_B32));
263     return true;
264   }
265   default:
266     return false;
267   }
268 }
269 
// Turn all pseudoterminators in the block into their equivalent non-terminator
// instructions. Returns the reverse iterator to the first non-terminator
// instruction in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  // Seen becomes true once a pseudo-terminator has been rewritten;
  // FirstNonTerm then remembers the first one encountered in this backward
  // walk (i.e. the rewritten instruction closest to the block end).
  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      // Terminators form a suffix of the block, so the first non-terminator
      // ends the scan. Prefer a recorded rewritten instruction if any.
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  // Entire block consisted of terminators (possibly all rewritten).
  return FirstNonTerm;
}
293 
294 MachineBasicBlock::reverse_iterator
295 SIOptimizeExecMasking::findExecCopy(MachineBasicBlock &MBB,
296                                     MachineBasicBlock::reverse_iterator I,
297                                     unsigned CopyToExec) const {
298   const unsigned InstLimit = 25;
299 
300   auto E = MBB.rend();
301   for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
302     Register CopyFromExec = isCopyFromExec(*I);
303     if (CopyFromExec.isValid())
304       return I;
305   }
306 
307   return E;
308 }
309 
310 // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
311 // report the register as unavailable because a super-register with a lane mask
312 // is unavailable.
313 static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
314   for (MachineBasicBlock *Succ : MBB.successors()) {
315     if (Succ->isLiveIn(Reg))
316       return true;
317   }
318 
319   return false;
320 }
321 
322 // Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
323 // the beginning of the BB is reached or Pred evaluates to true - which can be
324 // an arbitrary condition based on the current MachineInstr, for instance an
325 // target instruction. Breaks prematurely by returning nullptr if  one of the
326 // registers given in NonModifiableRegs is modified by the current instruction.
327 MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
328     MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
329     ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
330   MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
331                                       E = Origin.getParent()->rend();
332   unsigned CurrentIteration = 0;
333 
334   for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
335     if (A->isDebugInstr())
336       continue;
337 
338     if (Pred(&*A))
339       return &*A;
340 
341     for (MCRegister Reg : NonModifiableRegs) {
342       if (A->modifiesRegister(Reg, TRI))
343         return nullptr;
344     }
345 
346     ++CurrentIteration;
347   }
348 
349   return nullptr;
350 }
351 
352 // Determine if a register Reg is not re-defined and still in use
353 // in the range (Stop..Start].
354 // It does so by backwards calculating liveness from the end of the BB until
355 // either Stop or the beginning of the BB is reached.
356 // After liveness is calculated, we can determine if Reg is still in use and not
357 // defined inbetween the instructions.
358 bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
359                                                    MachineInstr &Start,
360                                                    MCRegister Reg,
361                                                    bool UseLiveOuts,
362                                                    bool IgnoreStart) const {
363   LivePhysRegs LR(*TRI);
364   if (UseLiveOuts)
365     LR.addLiveOuts(*Stop.getParent());
366 
367   MachineBasicBlock::reverse_iterator A(Start);
368   MachineBasicBlock::reverse_iterator E(Stop);
369 
370   if (IgnoreStart)
371     ++A;
372 
373   for (; A != Stop.getParent()->rend() && A != Stop; ++A) {
374     LR.stepBackward(*A);
375   }
376 
377   return !LR.available(*MRI, Reg);
378 }
379 
380 // Determine if a register Reg is not re-defined and still in use
381 // in the range (Stop..BB.end].
382 bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
383                                                  MCRegister Reg) const {
384   return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
385 }
386 
// Optimize sequences emitted for control flow lowering. They are originally
// emitted as the separate operations because spill code may need to be
// inserted for the saved copy of exec.
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() const {
  MCRegister Exec = TRI->getExec();

  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    // Rewrite *_term pseudos first; I points at the first non-terminator.
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy. This
    // can happen if control flow pseudos had their outputs used by phis.
    Register CopyToExec;

    // Look through a small window for the "exec = copy z" instruction.
    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I, CopyToExec);
    if (CopyFromExecInst == E) {
      // No "x = copy exec" found nearby; try a simpler local fold instead.
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);

        // Retarget the logical op to write exec directly; the copy is dead.
        PrepareExecInst->getOperand(0).setReg(Exec);

        LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');

        CopyToExecInst->eraseFromParent();
        Changed = true;
      }

      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
      continue;
    }

    // CopyFromExec is the register holding the saved exec value ("x" above).
    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    // Walk forward from just after "x = copy exec" to the "exec = copy z",
    // validating that exactly one foldable logical op sits in between and
    // nothing else interferes.
    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          LLVM_DEBUG(dbgs() << "Multiple instructions modify "
                            << printReg(CopyToExec, TRI) << '\n');
          SaveExecInst = nullptr;
          break;
        }

        // The defining instruction must have a saveexec counterpart.
        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy, before it
        // will be rewritten by the saveexec, i.e. hasOneUse. There may have
        // been another use, such as an inserted spill. For example:
        //
        // %sgpr0_sgpr1 = COPY %exec
        // spill %sgpr0_sgpr1
        // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
        //
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        // These uses of z will be rewritten to read exec after the fold.
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);

    // OtherOp is the non-exec-copy source ("y" above).
    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        // NOTE(review): this break exits the outer per-block loop, skipping
        // all remaining blocks in the function — looks suspicious; confirm
        // whether `continue` was intended.
        break;

      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    // x = s_<op>_saveexec y replaces all three original instructions.
    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    // z no longer exists; its readers now read exec directly.
    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}
553 
// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
// to the v_cmp instruction if it is safe to replace the sequence (see the
// conditions in the function body). This is after register allocation, so some
// checks on operand dependencies need to be considered.
MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
    MachineInstr &SaveExec, MCRegister Exec) const {

  MachineInstr *VCmp = nullptr;

  // The saveexec destination must be an SGPR (it receives the old exec mask).
  Register SaveExecDest = SaveExec.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return nullptr;

  // The saveexec source must be a register: it should be the v_cmp result.
  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return nullptr;

  // Try to find the last v_cmp instruction that defs the saveexec input
  // operand without any write to Exec or the saveexec input operand inbetween.
  VCmp = findInstrBackwards(
      SaveExec,
      [&](MachineInstr *Check) {
        // Match only v_cmp opcodes that have a v_cmpx counterpart and that
        // actually define the register the saveexec consumes.
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});

  if (!VCmp)
    return nullptr;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // Check if any of the v_cmp source operands is written by the saveexec.
  MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
  if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
      SaveExec.modifiesRegister(Src0->getReg(), TRI))
    return nullptr;

  MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
  if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
      SaveExec.modifiesRegister(Src1->getReg(), TRI))
    return nullptr;

  // Don't do the transformation if the destination operand is included in
  // it's MBB Live-outs, meaning it's used in any of it's successors, leading
  // to incorrect code if the v_cmp and therefore the def of
  // the dest operand is removed.
  if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
    return nullptr;

  // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
  // s_and_saveexec, skip the optimization.
  if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
                             true) ||
      isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
    return nullptr;

  // Try to determine if there is a write to any of the VCmp
  // operands between the saveexec and the vcmp.
  // If yes, additional VGPR spilling might need to be inserted. In this case,
  // it's not worth replacing the instruction sequence.
  SmallVector<MCRegister, 2> NonDefRegs;
  if (Src0->isReg())
    NonDefRegs.push_back(Src0->getReg());

  if (Src1->isReg())
    NonDefRegs.push_back(Src1->getReg());

  // Re-walk back to the v_cmp itself, this time guarding its source operands.
  if (!findInstrBackwards(
          SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
          NonDefRegs))
    return nullptr;

  return VCmp;
}
632 
// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
// Returns true if the replacement instructions were emitted; the caller is
// responsible for erasing the original pair afterwards.
bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());

  // No v_cmpx counterpart exists for this comparison.
  if (NewOpcode == -1)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(VCmp, AMDGPU::OpName::src1);

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();

  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  // Only materialize the exec save (s_mov dst, exec) if the saved value is
  // actually consumed somewhere.
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  // Copies an immediate modifier operand from the v_cmp, if the v_cmp's
  // encoding carries one (VOP3 forms do, plain VOP forms don't).
  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  // Operand order must mirror the v_cmpx encoding:
  // [src0_modifiers,] src0, [src1_modifiers,] src1 [, clamp].
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*Src0);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*Src1);

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags may no longer be correct.
  if (Src0->isReg())
    MRI->clearKillFlags(Src0->getReg());
  if (Src1->isReg())
    MRI->clearKillFlags(Src1->getReg());

  return true;
}
683 
684 // After all s_op_saveexec instructions are inserted,
685 // replace (on GFX10.3 and later)
686 // v_cmp_* SGPR, IMM, VGPR
687 // s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
688 // with
689 // s_mov_b32 EXEC_SGPR_DEST, exec_lo
690 // v_cmpx_* IMM, VGPR
691 // to reduce pipeline stalls.
692 bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
693   if (!ST->hasGFX10_3Insts())
694     return false;
695 
696   bool Changed = false;
697 
698   DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
699   MCRegister Exec = TRI->getExec();
700   const unsigned AndSaveExecOpcode =
701       ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
702 
703   for (MachineBasicBlock &MBB : *MF) {
704     for (MachineInstr &MI : MBB) {
705       // Record relevant v_cmp / s_and_saveexec instruction pairs for
706       // replacement.
707       if (MI.getOpcode() != AndSaveExecOpcode)
708         continue;
709 
710       if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
711         SaveExecVCmpMapping[&MI] = VCmp;
712     }
713   }
714 
715   for (const auto &Entry : SaveExecVCmpMapping) {
716     MachineInstr *SaveExecInstr = Entry.getFirst();
717     MachineInstr *VCmpInstr = Entry.getSecond();
718 
719     if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
720       SaveExecInstr->eraseFromParent();
721       VCmpInstr->eraseFromParent();
722 
723       Changed = true;
724     }
725   }
726 
727   return Changed;
728 }
729 
730 bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
731   if (skipFunction(MF.getFunction()))
732     return false;
733 
734   this->MF = &MF;
735   ST = &MF.getSubtarget<GCNSubtarget>();
736   TRI = ST->getRegisterInfo();
737   TII = ST->getInstrInfo();
738   MRI = &MF.getRegInfo();
739 
740   bool Changed = optimizeExecSequence();
741   Changed |= optimizeVCmpxAndSaveexecSequence();
742 
743   return Changed;
744 }
745