1d88c1a5aSDimitry Andric //===-- SIOptimizeExecMasking.cpp -----------------------------------------===//
2d88c1a5aSDimitry Andric //
3d88c1a5aSDimitry Andric //                     The LLVM Compiler Infrastructure
4d88c1a5aSDimitry Andric //
5d88c1a5aSDimitry Andric // This file is distributed under the University of Illinois Open Source
6d88c1a5aSDimitry Andric // License. See LICENSE.TXT for details.
7d88c1a5aSDimitry Andric //
8d88c1a5aSDimitry Andric //===----------------------------------------------------------------------===//
9d88c1a5aSDimitry Andric 
10d88c1a5aSDimitry Andric #include "AMDGPU.h"
11d88c1a5aSDimitry Andric #include "AMDGPUSubtarget.h"
12d88c1a5aSDimitry Andric #include "SIInstrInfo.h"
13*4ba319b5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
142cab237bSDimitry Andric #include "llvm/ADT/SmallSet.h"
15d88c1a5aSDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
16d88c1a5aSDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h"
17d88c1a5aSDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h"
18d88c1a5aSDimitry Andric #include "llvm/Support/Debug.h"
19d88c1a5aSDimitry Andric 
20d88c1a5aSDimitry Andric using namespace llvm;
21d88c1a5aSDimitry Andric 
22d88c1a5aSDimitry Andric #define DEBUG_TYPE "si-optimize-exec-masking"
23d88c1a5aSDimitry Andric 
24d88c1a5aSDimitry Andric namespace {
25d88c1a5aSDimitry Andric 
26d88c1a5aSDimitry Andric class SIOptimizeExecMasking : public MachineFunctionPass {
27d88c1a5aSDimitry Andric public:
28d88c1a5aSDimitry Andric   static char ID;
29d88c1a5aSDimitry Andric 
30d88c1a5aSDimitry Andric public:
SIOptimizeExecMasking()31d88c1a5aSDimitry Andric   SIOptimizeExecMasking() : MachineFunctionPass(ID) {
32d88c1a5aSDimitry Andric     initializeSIOptimizeExecMaskingPass(*PassRegistry::getPassRegistry());
33d88c1a5aSDimitry Andric   }
34d88c1a5aSDimitry Andric 
35d88c1a5aSDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
36d88c1a5aSDimitry Andric 
getPassName() const37d88c1a5aSDimitry Andric   StringRef getPassName() const override {
38d88c1a5aSDimitry Andric     return "SI optimize exec mask operations";
39d88c1a5aSDimitry Andric   }
40d88c1a5aSDimitry Andric 
getAnalysisUsage(AnalysisUsage & AU) const41d88c1a5aSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
42d88c1a5aSDimitry Andric     AU.setPreservesCFG();
43d88c1a5aSDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
44d88c1a5aSDimitry Andric   }
45d88c1a5aSDimitry Andric };
46d88c1a5aSDimitry Andric 
47d88c1a5aSDimitry Andric } // End anonymous namespace.
48d88c1a5aSDimitry Andric 
49d88c1a5aSDimitry Andric INITIALIZE_PASS_BEGIN(SIOptimizeExecMasking, DEBUG_TYPE,
50d88c1a5aSDimitry Andric                       "SI optimize exec mask operations", false, false)
51d88c1a5aSDimitry Andric INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
52d88c1a5aSDimitry Andric INITIALIZE_PASS_END(SIOptimizeExecMasking, DEBUG_TYPE,
53d88c1a5aSDimitry Andric                     "SI optimize exec mask operations", false, false)
54d88c1a5aSDimitry Andric 
55d88c1a5aSDimitry Andric char SIOptimizeExecMasking::ID = 0;
56d88c1a5aSDimitry Andric 
57d88c1a5aSDimitry Andric char &llvm::SIOptimizeExecMaskingID = SIOptimizeExecMasking::ID;
58d88c1a5aSDimitry Andric 
59d88c1a5aSDimitry Andric /// If \p MI is a copy from exec, return the register copied to.
isCopyFromExec(const MachineInstr & MI)60d88c1a5aSDimitry Andric static unsigned isCopyFromExec(const MachineInstr &MI) {
61d88c1a5aSDimitry Andric   switch (MI.getOpcode()) {
62d88c1a5aSDimitry Andric   case AMDGPU::COPY:
63d88c1a5aSDimitry Andric   case AMDGPU::S_MOV_B64:
64d88c1a5aSDimitry Andric   case AMDGPU::S_MOV_B64_term: {
65d88c1a5aSDimitry Andric     const MachineOperand &Src = MI.getOperand(1);
66d88c1a5aSDimitry Andric     if (Src.isReg() && Src.getReg() == AMDGPU::EXEC)
67d88c1a5aSDimitry Andric       return MI.getOperand(0).getReg();
68d88c1a5aSDimitry Andric   }
69d88c1a5aSDimitry Andric   }
70d88c1a5aSDimitry Andric 
71d88c1a5aSDimitry Andric   return AMDGPU::NoRegister;
72d88c1a5aSDimitry Andric }
73d88c1a5aSDimitry Andric 
74d88c1a5aSDimitry Andric /// If \p MI is a copy to exec, return the register copied from.
isCopyToExec(const MachineInstr & MI)75d88c1a5aSDimitry Andric static unsigned isCopyToExec(const MachineInstr &MI) {
76d88c1a5aSDimitry Andric   switch (MI.getOpcode()) {
77d88c1a5aSDimitry Andric   case AMDGPU::COPY:
78d88c1a5aSDimitry Andric   case AMDGPU::S_MOV_B64: {
79d88c1a5aSDimitry Andric     const MachineOperand &Dst = MI.getOperand(0);
80*4ba319b5SDimitry Andric     if (Dst.isReg() && Dst.getReg() == AMDGPU::EXEC && MI.getOperand(1).isReg())
81d88c1a5aSDimitry Andric       return MI.getOperand(1).getReg();
82d88c1a5aSDimitry Andric     break;
83d88c1a5aSDimitry Andric   }
84d88c1a5aSDimitry Andric   case AMDGPU::S_MOV_B64_term:
85d88c1a5aSDimitry Andric     llvm_unreachable("should have been replaced");
86d88c1a5aSDimitry Andric   }
87d88c1a5aSDimitry Andric 
88d88c1a5aSDimitry Andric   return AMDGPU::NoRegister;
89d88c1a5aSDimitry Andric }
90d88c1a5aSDimitry Andric 
912cab237bSDimitry Andric /// If \p MI is a logical operation on an exec value,
922cab237bSDimitry Andric /// return the register copied to.
isLogicalOpOnExec(const MachineInstr & MI)932cab237bSDimitry Andric static unsigned isLogicalOpOnExec(const MachineInstr &MI) {
942cab237bSDimitry Andric   switch (MI.getOpcode()) {
952cab237bSDimitry Andric   case AMDGPU::S_AND_B64:
962cab237bSDimitry Andric   case AMDGPU::S_OR_B64:
972cab237bSDimitry Andric   case AMDGPU::S_XOR_B64:
982cab237bSDimitry Andric   case AMDGPU::S_ANDN2_B64:
992cab237bSDimitry Andric   case AMDGPU::S_ORN2_B64:
1002cab237bSDimitry Andric   case AMDGPU::S_NAND_B64:
1012cab237bSDimitry Andric   case AMDGPU::S_NOR_B64:
1022cab237bSDimitry Andric   case AMDGPU::S_XNOR_B64: {
1032cab237bSDimitry Andric     const MachineOperand &Src1 = MI.getOperand(1);
1042cab237bSDimitry Andric     if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
1052cab237bSDimitry Andric       return MI.getOperand(0).getReg();
1062cab237bSDimitry Andric     const MachineOperand &Src2 = MI.getOperand(2);
1072cab237bSDimitry Andric     if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
1082cab237bSDimitry Andric       return MI.getOperand(0).getReg();
1092cab237bSDimitry Andric   }
1102cab237bSDimitry Andric   }
1112cab237bSDimitry Andric 
1122cab237bSDimitry Andric   return AMDGPU::NoRegister;
1132cab237bSDimitry Andric }
1142cab237bSDimitry Andric 
getSaveExecOp(unsigned Opc)115d88c1a5aSDimitry Andric static unsigned getSaveExecOp(unsigned Opc) {
116d88c1a5aSDimitry Andric   switch (Opc) {
117d88c1a5aSDimitry Andric   case AMDGPU::S_AND_B64:
118d88c1a5aSDimitry Andric     return AMDGPU::S_AND_SAVEEXEC_B64;
119d88c1a5aSDimitry Andric   case AMDGPU::S_OR_B64:
120d88c1a5aSDimitry Andric     return AMDGPU::S_OR_SAVEEXEC_B64;
121d88c1a5aSDimitry Andric   case AMDGPU::S_XOR_B64:
122d88c1a5aSDimitry Andric     return AMDGPU::S_XOR_SAVEEXEC_B64;
123d88c1a5aSDimitry Andric   case AMDGPU::S_ANDN2_B64:
124d88c1a5aSDimitry Andric     return AMDGPU::S_ANDN2_SAVEEXEC_B64;
125d88c1a5aSDimitry Andric   case AMDGPU::S_ORN2_B64:
126d88c1a5aSDimitry Andric     return AMDGPU::S_ORN2_SAVEEXEC_B64;
127d88c1a5aSDimitry Andric   case AMDGPU::S_NAND_B64:
128d88c1a5aSDimitry Andric     return AMDGPU::S_NAND_SAVEEXEC_B64;
129d88c1a5aSDimitry Andric   case AMDGPU::S_NOR_B64:
130d88c1a5aSDimitry Andric     return AMDGPU::S_NOR_SAVEEXEC_B64;
131d88c1a5aSDimitry Andric   case AMDGPU::S_XNOR_B64:
132d88c1a5aSDimitry Andric     return AMDGPU::S_XNOR_SAVEEXEC_B64;
133d88c1a5aSDimitry Andric   default:
134d88c1a5aSDimitry Andric     return AMDGPU::INSTRUCTION_LIST_END;
135d88c1a5aSDimitry Andric   }
136d88c1a5aSDimitry Andric }
137d88c1a5aSDimitry Andric 
138d88c1a5aSDimitry Andric // These are only terminators to get correct spill code placement during
139d88c1a5aSDimitry Andric // register allocation, so turn them back into normal instructions. Only one of
140d88c1a5aSDimitry Andric // these is expected per block.
removeTerminatorBit(const SIInstrInfo & TII,MachineInstr & MI)141d88c1a5aSDimitry Andric static bool removeTerminatorBit(const SIInstrInfo &TII, MachineInstr &MI) {
142d88c1a5aSDimitry Andric   switch (MI.getOpcode()) {
143d88c1a5aSDimitry Andric   case AMDGPU::S_MOV_B64_term: {
144d88c1a5aSDimitry Andric     MI.setDesc(TII.get(AMDGPU::COPY));
145d88c1a5aSDimitry Andric     return true;
146d88c1a5aSDimitry Andric   }
147d88c1a5aSDimitry Andric   case AMDGPU::S_XOR_B64_term: {
148d88c1a5aSDimitry Andric     // This is only a terminator to get the correct spill code placement during
149d88c1a5aSDimitry Andric     // register allocation.
150d88c1a5aSDimitry Andric     MI.setDesc(TII.get(AMDGPU::S_XOR_B64));
151d88c1a5aSDimitry Andric     return true;
152d88c1a5aSDimitry Andric   }
153d88c1a5aSDimitry Andric   case AMDGPU::S_ANDN2_B64_term: {
154d88c1a5aSDimitry Andric     // This is only a terminator to get the correct spill code placement during
155d88c1a5aSDimitry Andric     // register allocation.
156d88c1a5aSDimitry Andric     MI.setDesc(TII.get(AMDGPU::S_ANDN2_B64));
157d88c1a5aSDimitry Andric     return true;
158d88c1a5aSDimitry Andric   }
159d88c1a5aSDimitry Andric   default:
160d88c1a5aSDimitry Andric     return false;
161d88c1a5aSDimitry Andric   }
162d88c1a5aSDimitry Andric }
163d88c1a5aSDimitry Andric 
fixTerminators(const SIInstrInfo & TII,MachineBasicBlock & MBB)164d88c1a5aSDimitry Andric static MachineBasicBlock::reverse_iterator fixTerminators(
165d88c1a5aSDimitry Andric   const SIInstrInfo &TII,
166d88c1a5aSDimitry Andric   MachineBasicBlock &MBB) {
167d88c1a5aSDimitry Andric   MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
168d88c1a5aSDimitry Andric   for (; I != E; ++I) {
169d88c1a5aSDimitry Andric     if (!I->isTerminator())
170d88c1a5aSDimitry Andric       return I;
171d88c1a5aSDimitry Andric 
172d88c1a5aSDimitry Andric     if (removeTerminatorBit(TII, *I))
173d88c1a5aSDimitry Andric       return I;
174d88c1a5aSDimitry Andric   }
175d88c1a5aSDimitry Andric 
176d88c1a5aSDimitry Andric   return E;
177d88c1a5aSDimitry Andric }
178d88c1a5aSDimitry Andric 
findExecCopy(const SIInstrInfo & TII,MachineBasicBlock & MBB,MachineBasicBlock::reverse_iterator I,unsigned CopyToExec)179d88c1a5aSDimitry Andric static MachineBasicBlock::reverse_iterator findExecCopy(
180d88c1a5aSDimitry Andric   const SIInstrInfo &TII,
181d88c1a5aSDimitry Andric   MachineBasicBlock &MBB,
182d88c1a5aSDimitry Andric   MachineBasicBlock::reverse_iterator I,
183d88c1a5aSDimitry Andric   unsigned CopyToExec) {
184d88c1a5aSDimitry Andric   const unsigned InstLimit = 25;
185d88c1a5aSDimitry Andric 
186d88c1a5aSDimitry Andric   auto E = MBB.rend();
187d88c1a5aSDimitry Andric   for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
188d88c1a5aSDimitry Andric     unsigned CopyFromExec = isCopyFromExec(*I);
189d88c1a5aSDimitry Andric     if (CopyFromExec != AMDGPU::NoRegister)
190d88c1a5aSDimitry Andric       return I;
191d88c1a5aSDimitry Andric   }
192d88c1a5aSDimitry Andric 
193d88c1a5aSDimitry Andric   return E;
194d88c1a5aSDimitry Andric }
195d88c1a5aSDimitry Andric 
196d88c1a5aSDimitry Andric // XXX - Seems LivePhysRegs doesn't work correctly since it will incorrectly
197d88c1a5aSDimitry Andric // repor tthe register as unavailable because a super-register with a lane mask
198d88c1a5aSDimitry Andric // as unavailable.
isLiveOut(const MachineBasicBlock & MBB,unsigned Reg)199d88c1a5aSDimitry Andric static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
200d88c1a5aSDimitry Andric   for (MachineBasicBlock *Succ : MBB.successors()) {
201d88c1a5aSDimitry Andric     if (Succ->isLiveIn(Reg))
202d88c1a5aSDimitry Andric       return true;
203d88c1a5aSDimitry Andric   }
204d88c1a5aSDimitry Andric 
205d88c1a5aSDimitry Andric   return false;
206d88c1a5aSDimitry Andric }
207d88c1a5aSDimitry Andric 
runOnMachineFunction(MachineFunction & MF)208d88c1a5aSDimitry Andric bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
2092cab237bSDimitry Andric   if (skipFunction(MF.getFunction()))
2102cab237bSDimitry Andric     return false;
2112cab237bSDimitry Andric 
212*4ba319b5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
213d88c1a5aSDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
214d88c1a5aSDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
215d88c1a5aSDimitry Andric 
216d88c1a5aSDimitry Andric   // Optimize sequences emitted for control flow lowering. They are originally
217d88c1a5aSDimitry Andric   // emitted as the separate operations because spill code may need to be
218d88c1a5aSDimitry Andric   // inserted for the saved copy of exec.
219d88c1a5aSDimitry Andric   //
220d88c1a5aSDimitry Andric   //     x = copy exec
221d88c1a5aSDimitry Andric   //     z = s_<op>_b64 x, y
222d88c1a5aSDimitry Andric   //     exec = copy z
223d88c1a5aSDimitry Andric   // =>
224d88c1a5aSDimitry Andric   //     x = s_<op>_saveexec_b64 y
225d88c1a5aSDimitry Andric   //
226d88c1a5aSDimitry Andric 
227d88c1a5aSDimitry Andric   for (MachineBasicBlock &MBB : MF) {
228d88c1a5aSDimitry Andric     MachineBasicBlock::reverse_iterator I = fixTerminators(*TII, MBB);
229d88c1a5aSDimitry Andric     MachineBasicBlock::reverse_iterator E = MBB.rend();
230d88c1a5aSDimitry Andric     if (I == E)
231d88c1a5aSDimitry Andric       continue;
232d88c1a5aSDimitry Andric 
233d88c1a5aSDimitry Andric     unsigned CopyToExec = isCopyToExec(*I);
234d88c1a5aSDimitry Andric     if (CopyToExec == AMDGPU::NoRegister)
235d88c1a5aSDimitry Andric       continue;
236d88c1a5aSDimitry Andric 
237d88c1a5aSDimitry Andric     // Scan backwards to find the def.
238d88c1a5aSDimitry Andric     auto CopyToExecInst = &*I;
239d88c1a5aSDimitry Andric     auto CopyFromExecInst = findExecCopy(*TII, MBB, I, CopyToExec);
2402cab237bSDimitry Andric     if (CopyFromExecInst == E) {
2412cab237bSDimitry Andric       auto PrepareExecInst = std::next(I);
2422cab237bSDimitry Andric       if (PrepareExecInst == E)
243d88c1a5aSDimitry Andric         continue;
2442cab237bSDimitry Andric       // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
2452cab237bSDimitry Andric       if (CopyToExecInst->getOperand(1).isKill() &&
2462cab237bSDimitry Andric           isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
247*4ba319b5SDimitry Andric         LLVM_DEBUG(dbgs() << "Fold exec copy: " << *PrepareExecInst);
2482cab237bSDimitry Andric 
2492cab237bSDimitry Andric         PrepareExecInst->getOperand(0).setReg(AMDGPU::EXEC);
2502cab237bSDimitry Andric 
251*4ba319b5SDimitry Andric         LLVM_DEBUG(dbgs() << "into: " << *PrepareExecInst << '\n');
2522cab237bSDimitry Andric 
2532cab237bSDimitry Andric         CopyToExecInst->eraseFromParent();
2542cab237bSDimitry Andric       }
2552cab237bSDimitry Andric 
2562cab237bSDimitry Andric       continue;
2572cab237bSDimitry Andric     }
258d88c1a5aSDimitry Andric 
259d88c1a5aSDimitry Andric     if (isLiveOut(MBB, CopyToExec)) {
260d88c1a5aSDimitry Andric       // The copied register is live out and has a second use in another block.
261*4ba319b5SDimitry Andric       LLVM_DEBUG(dbgs() << "Exec copy source register is live out\n");
262d88c1a5aSDimitry Andric       continue;
263d88c1a5aSDimitry Andric     }
264d88c1a5aSDimitry Andric 
265d88c1a5aSDimitry Andric     unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
266d88c1a5aSDimitry Andric     MachineInstr *SaveExecInst = nullptr;
267d88c1a5aSDimitry Andric     SmallVector<MachineInstr *, 4> OtherUseInsts;
268d88c1a5aSDimitry Andric 
269d88c1a5aSDimitry Andric     for (MachineBasicBlock::iterator J
270d88c1a5aSDimitry Andric            = std::next(CopyFromExecInst->getIterator()), JE = I->getIterator();
271d88c1a5aSDimitry Andric          J != JE; ++J) {
272d88c1a5aSDimitry Andric       if (SaveExecInst && J->readsRegister(AMDGPU::EXEC, TRI)) {
273*4ba319b5SDimitry Andric         LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
274d88c1a5aSDimitry Andric         // Make sure this is inserted after any VALU ops that may have been
275d88c1a5aSDimitry Andric         // scheduled in between.
276d88c1a5aSDimitry Andric         SaveExecInst = nullptr;
277d88c1a5aSDimitry Andric         break;
278d88c1a5aSDimitry Andric       }
279d88c1a5aSDimitry Andric 
2802cab237bSDimitry Andric       bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);
2812cab237bSDimitry Andric 
282d88c1a5aSDimitry Andric       if (J->modifiesRegister(CopyToExec, TRI)) {
283d88c1a5aSDimitry Andric         if (SaveExecInst) {
284*4ba319b5SDimitry Andric           LLVM_DEBUG(dbgs() << "Multiple instructions modify "
2852cab237bSDimitry Andric                             << printReg(CopyToExec, TRI) << '\n');
286d88c1a5aSDimitry Andric           SaveExecInst = nullptr;
287d88c1a5aSDimitry Andric           break;
288d88c1a5aSDimitry Andric         }
289d88c1a5aSDimitry Andric 
290d88c1a5aSDimitry Andric         unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
291d88c1a5aSDimitry Andric         if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
292d88c1a5aSDimitry Andric           break;
293d88c1a5aSDimitry Andric 
2942cab237bSDimitry Andric         if (ReadsCopyFromExec) {
295d88c1a5aSDimitry Andric           SaveExecInst = &*J;
296*4ba319b5SDimitry Andric           LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
297d88c1a5aSDimitry Andric           continue;
298d88c1a5aSDimitry Andric         } else {
299*4ba319b5SDimitry Andric           LLVM_DEBUG(dbgs()
300*4ba319b5SDimitry Andric                      << "Instruction does not read exec copy: " << *J << '\n');
301d88c1a5aSDimitry Andric           break;
302d88c1a5aSDimitry Andric         }
3032cab237bSDimitry Andric       } else if (ReadsCopyFromExec && !SaveExecInst) {
3042cab237bSDimitry Andric         // Make sure no other instruction is trying to use this copy, before it
3052cab237bSDimitry Andric         // will be rewritten by the saveexec, i.e. hasOneUse. There may have
3062cab237bSDimitry Andric         // been another use, such as an inserted spill. For example:
3072cab237bSDimitry Andric         //
3082cab237bSDimitry Andric         // %sgpr0_sgpr1 = COPY %exec
3092cab237bSDimitry Andric         // spill %sgpr0_sgpr1
3102cab237bSDimitry Andric         // %sgpr2_sgpr3 = S_AND_B64 %sgpr0_sgpr1
3112cab237bSDimitry Andric         //
312*4ba319b5SDimitry Andric         LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
313*4ba319b5SDimitry Andric                           << '\n');
3142cab237bSDimitry Andric         break;
315d88c1a5aSDimitry Andric       }
316d88c1a5aSDimitry Andric 
317d88c1a5aSDimitry Andric       if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
318d88c1a5aSDimitry Andric         assert(SaveExecInst != &*J);
319d88c1a5aSDimitry Andric         OtherUseInsts.push_back(&*J);
320d88c1a5aSDimitry Andric       }
321d88c1a5aSDimitry Andric     }
322d88c1a5aSDimitry Andric 
323d88c1a5aSDimitry Andric     if (!SaveExecInst)
324d88c1a5aSDimitry Andric       continue;
325d88c1a5aSDimitry Andric 
326*4ba319b5SDimitry Andric     LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');
327d88c1a5aSDimitry Andric 
328d88c1a5aSDimitry Andric     MachineOperand &Src0 = SaveExecInst->getOperand(1);
329d88c1a5aSDimitry Andric     MachineOperand &Src1 = SaveExecInst->getOperand(2);
330d88c1a5aSDimitry Andric 
331d88c1a5aSDimitry Andric     MachineOperand *OtherOp = nullptr;
332d88c1a5aSDimitry Andric 
333d88c1a5aSDimitry Andric     if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
334d88c1a5aSDimitry Andric       OtherOp = &Src1;
335d88c1a5aSDimitry Andric     } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
336d88c1a5aSDimitry Andric       if (!SaveExecInst->isCommutable())
337d88c1a5aSDimitry Andric         break;
338d88c1a5aSDimitry Andric 
339d88c1a5aSDimitry Andric       OtherOp = &Src0;
340d88c1a5aSDimitry Andric     } else
341d88c1a5aSDimitry Andric       llvm_unreachable("unexpected");
342d88c1a5aSDimitry Andric 
343d88c1a5aSDimitry Andric     CopyFromExecInst->eraseFromParent();
344d88c1a5aSDimitry Andric 
345d88c1a5aSDimitry Andric     auto InsPt = SaveExecInst->getIterator();
346d88c1a5aSDimitry Andric     const DebugLoc &DL = SaveExecInst->getDebugLoc();
347d88c1a5aSDimitry Andric 
348d88c1a5aSDimitry Andric     BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
349d88c1a5aSDimitry Andric             CopyFromExec)
350d88c1a5aSDimitry Andric       .addReg(OtherOp->getReg());
351d88c1a5aSDimitry Andric     SaveExecInst->eraseFromParent();
352d88c1a5aSDimitry Andric 
353d88c1a5aSDimitry Andric     CopyToExecInst->eraseFromParent();
354d88c1a5aSDimitry Andric 
355d88c1a5aSDimitry Andric     for (MachineInstr *OtherInst : OtherUseInsts) {
356d88c1a5aSDimitry Andric       OtherInst->substituteRegister(CopyToExec, AMDGPU::EXEC,
357d88c1a5aSDimitry Andric                                     AMDGPU::NoSubRegister, *TRI);
358d88c1a5aSDimitry Andric     }
359d88c1a5aSDimitry Andric   }
360d88c1a5aSDimitry Andric 
361d88c1a5aSDimitry Andric   return true;
362d88c1a5aSDimitry Andric 
363d88c1a5aSDimitry Andric }
364