12cab237bSDimitry Andric //===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===//
22cab237bSDimitry Andric //
32cab237bSDimitry Andric //                     The LLVM Compiler Infrastructure
42cab237bSDimitry Andric //
52cab237bSDimitry Andric // This file is distributed under the University of Illinois Open Source
62cab237bSDimitry Andric // License. See LICENSE.TXT for details.
72cab237bSDimitry Andric //
82cab237bSDimitry Andric //===----------------------------------------------------------------------===//
92cab237bSDimitry Andric //
102cab237bSDimitry Andric /// \file
114ba319b5SDimitry Andric /// This pass removes redundant S_OR_B64 instructions enabling lanes in
122cab237bSDimitry Andric /// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
132cab237bSDimitry Andric /// vector instructions between them we can only keep outer SI_END_CF, given
142cab237bSDimitry Andric /// that CFG is structured and exec bits of the outer end statement are always
152cab237bSDimitry Andric /// not less than exec bit of the inner one.
162cab237bSDimitry Andric ///
172cab237bSDimitry Andric /// This needs to be done before the RA to eliminate saved exec bits registers
182cab237bSDimitry Andric /// but after register coalescer to have no vector registers copies in between
192cab237bSDimitry Andric /// of different end cf statements.
202cab237bSDimitry Andric ///
212cab237bSDimitry Andric //===----------------------------------------------------------------------===//
222cab237bSDimitry Andric 
232cab237bSDimitry Andric #include "AMDGPU.h"
242cab237bSDimitry Andric #include "AMDGPUSubtarget.h"
252cab237bSDimitry Andric #include "SIInstrInfo.h"
264ba319b5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
272cab237bSDimitry Andric #include "llvm/CodeGen/LiveIntervals.h"
282cab237bSDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
292cab237bSDimitry Andric 
302cab237bSDimitry Andric using namespace llvm;
312cab237bSDimitry Andric 
322cab237bSDimitry Andric #define DEBUG_TYPE "si-optimize-exec-masking-pre-ra"
332cab237bSDimitry Andric 
342cab237bSDimitry Andric namespace {
352cab237bSDimitry Andric 
/// Machine function pass that removes redundant exec-mask manipulation
/// (collapses adjacent end-cf S_OR_B64 sequences, folds the
/// V_CNDMASK/V_CMP/S_AND negation pattern, and deletes dead code before
/// S_ENDPGM) before register allocation.
class SIOptimizeExecMaskingPreRA : public MachineFunctionPass {
public:
  static char ID;

public:
  SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
    initializeSIOptimizeExecMaskingPreRAPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI optimize exec mask operations pre-RA";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // LiveIntervals is required as input; the pass keeps it up to date
    // itself (see the interval recomputation in runOnMachineFunction),
    // so all analyses are reported as preserved.
    AU.addRequired<LiveIntervals>();
    AU.setPreservesAll();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
572cab237bSDimitry Andric 
582cab237bSDimitry Andric } // End anonymous namespace.
592cab237bSDimitry Andric 
INITIALIZE_PASS_BEGIN(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                      "SI optimize exec mask operations pre-RA", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIOptimizeExecMaskingPreRA, DEBUG_TYPE,
                    "SI optimize exec mask operations pre-RA", false, false)

char SIOptimizeExecMaskingPreRA::ID = 0;

// Exported pass ID used by the target pass configuration to reference
// this pass without constructing it.
char &llvm::SIOptimizeExecMaskingPreRAID = SIOptimizeExecMaskingPreRA::ID;

/// Factory used by the AMDGPU target machine to add this pass to the
/// codegen pipeline.
FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
  return new SIOptimizeExecMaskingPreRA();
}
732cab237bSDimitry Andric 
isEndCF(const MachineInstr & MI,const SIRegisterInfo * TRI)742cab237bSDimitry Andric static bool isEndCF(const MachineInstr& MI, const SIRegisterInfo* TRI) {
752cab237bSDimitry Andric   return MI.getOpcode() == AMDGPU::S_OR_B64 &&
762cab237bSDimitry Andric          MI.modifiesRegister(AMDGPU::EXEC, TRI);
772cab237bSDimitry Andric }
782cab237bSDimitry Andric 
isFullExecCopy(const MachineInstr & MI)792cab237bSDimitry Andric static bool isFullExecCopy(const MachineInstr& MI) {
802cab237bSDimitry Andric   return MI.isFullCopy() && MI.getOperand(1).getReg() == AMDGPU::EXEC;
812cab237bSDimitry Andric }
822cab237bSDimitry Andric 
getOrNonExecReg(const MachineInstr & MI,const SIInstrInfo & TII)832cab237bSDimitry Andric static unsigned getOrNonExecReg(const MachineInstr &MI,
842cab237bSDimitry Andric                                 const SIInstrInfo &TII) {
852cab237bSDimitry Andric   auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
862cab237bSDimitry Andric   if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
872cab237bSDimitry Andric      return Op->getReg();
882cab237bSDimitry Andric   Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
892cab237bSDimitry Andric   if (Op->isReg() && Op->getReg() != AMDGPU::EXEC)
902cab237bSDimitry Andric      return Op->getReg();
912cab237bSDimitry Andric   return AMDGPU::NoRegister;
922cab237bSDimitry Andric }
932cab237bSDimitry Andric 
/// For an end-cf S_OR_B64 \p MI, return the instruction that produced its
/// saved-exec operand, provided that instruction is a unique full copy of
/// exec; otherwise return nullptr.
static MachineInstr *getOrExecSource(const MachineInstr &MI,
                                     const SIInstrInfo &TII,
                                     const MachineRegisterInfo &MRI) {
  const unsigned SavedExec = getOrNonExecReg(MI, TII);
  if (SavedExec == AMDGPU::NoRegister)
    return nullptr;
  MachineInstr *SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
  if (SaveExecInst && isFullExecCopy(*SaveExecInst))
    return SaveExecInst;
  return nullptr;
}
1052cab237bSDimitry Andric 
106*b5893f02SDimitry Andric // Optimize sequence
107*b5893f02SDimitry Andric //    %sel = V_CNDMASK_B32_e64 0, 1, %cc
108*b5893f02SDimitry Andric //    %cmp = V_CMP_NE_U32 1, %1
109*b5893f02SDimitry Andric //    $vcc = S_AND_B64 $exec, %cmp
110*b5893f02SDimitry Andric //    S_CBRANCH_VCC[N]Z
111*b5893f02SDimitry Andric // =>
112*b5893f02SDimitry Andric //    $vcc = S_ANDN2_B64 $exec, %cc
113*b5893f02SDimitry Andric //    S_CBRANCH_VCC[N]Z
114*b5893f02SDimitry Andric //
115*b5893f02SDimitry Andric // It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the
116*b5893f02SDimitry Andric // rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but
117*b5893f02SDimitry Andric // only 3 first instructions are really needed. S_AND_B64 with exec is a
118*b5893f02SDimitry Andric // required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive
119*b5893f02SDimitry Andric // lanes.
120*b5893f02SDimitry Andric //
121*b5893f02SDimitry Andric // Returns %cc register on success.
optimizeVcndVcmpPair(MachineBasicBlock & MBB,const GCNSubtarget & ST,MachineRegisterInfo & MRI,LiveIntervals * LIS)122*b5893f02SDimitry Andric static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
123*b5893f02SDimitry Andric                                      const GCNSubtarget &ST,
124*b5893f02SDimitry Andric                                      MachineRegisterInfo &MRI,
125*b5893f02SDimitry Andric                                      LiveIntervals *LIS) {
126*b5893f02SDimitry Andric   const SIRegisterInfo *TRI = ST.getRegisterInfo();
127*b5893f02SDimitry Andric   const SIInstrInfo *TII = ST.getInstrInfo();
128*b5893f02SDimitry Andric   const unsigned AndOpc = AMDGPU::S_AND_B64;
129*b5893f02SDimitry Andric   const unsigned Andn2Opc = AMDGPU::S_ANDN2_B64;
130*b5893f02SDimitry Andric   const unsigned CondReg = AMDGPU::VCC;
131*b5893f02SDimitry Andric   const unsigned ExecReg = AMDGPU::EXEC;
132*b5893f02SDimitry Andric 
133*b5893f02SDimitry Andric   auto I = llvm::find_if(MBB.terminators(), [](const MachineInstr &MI) {
134*b5893f02SDimitry Andric                            unsigned Opc = MI.getOpcode();
135*b5893f02SDimitry Andric                            return Opc == AMDGPU::S_CBRANCH_VCCZ ||
136*b5893f02SDimitry Andric                                   Opc == AMDGPU::S_CBRANCH_VCCNZ; });
137*b5893f02SDimitry Andric   if (I == MBB.terminators().end())
138*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
139*b5893f02SDimitry Andric 
140*b5893f02SDimitry Andric   auto *And = TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister,
141*b5893f02SDimitry Andric                                    *I, MRI, LIS);
142*b5893f02SDimitry Andric   if (!And || And->getOpcode() != AndOpc ||
143*b5893f02SDimitry Andric       !And->getOperand(1).isReg() || !And->getOperand(2).isReg())
144*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
145*b5893f02SDimitry Andric 
146*b5893f02SDimitry Andric   MachineOperand *AndCC = &And->getOperand(1);
147*b5893f02SDimitry Andric   unsigned CmpReg = AndCC->getReg();
148*b5893f02SDimitry Andric   unsigned CmpSubReg = AndCC->getSubReg();
149*b5893f02SDimitry Andric   if (CmpReg == ExecReg) {
150*b5893f02SDimitry Andric     AndCC = &And->getOperand(2);
151*b5893f02SDimitry Andric     CmpReg = AndCC->getReg();
152*b5893f02SDimitry Andric     CmpSubReg = AndCC->getSubReg();
153*b5893f02SDimitry Andric   } else if (And->getOperand(2).getReg() != ExecReg) {
154*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
155*b5893f02SDimitry Andric   }
156*b5893f02SDimitry Andric 
157*b5893f02SDimitry Andric   auto *Cmp = TRI->findReachingDef(CmpReg, CmpSubReg, *And, MRI, LIS);
158*b5893f02SDimitry Andric   if (!Cmp || !(Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
159*b5893f02SDimitry Andric                 Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
160*b5893f02SDimitry Andric       Cmp->getParent() != And->getParent())
161*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
162*b5893f02SDimitry Andric 
163*b5893f02SDimitry Andric   MachineOperand *Op1 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src0);
164*b5893f02SDimitry Andric   MachineOperand *Op2 = TII->getNamedOperand(*Cmp, AMDGPU::OpName::src1);
165*b5893f02SDimitry Andric   if (Op1->isImm() && Op2->isReg())
166*b5893f02SDimitry Andric     std::swap(Op1, Op2);
167*b5893f02SDimitry Andric   if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
168*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
169*b5893f02SDimitry Andric 
170*b5893f02SDimitry Andric   unsigned SelReg = Op1->getReg();
171*b5893f02SDimitry Andric   auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
172*b5893f02SDimitry Andric   if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
173*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
174*b5893f02SDimitry Andric 
175*b5893f02SDimitry Andric   Op1 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
176*b5893f02SDimitry Andric   Op2 = TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
177*b5893f02SDimitry Andric   MachineOperand *CC = TII->getNamedOperand(*Sel, AMDGPU::OpName::src2);
178*b5893f02SDimitry Andric   if (!Op1->isImm() || !Op2->isImm() || !CC->isReg() ||
179*b5893f02SDimitry Andric       Op1->getImm() != 0 || Op2->getImm() != 1)
180*b5893f02SDimitry Andric     return AMDGPU::NoRegister;
181*b5893f02SDimitry Andric 
182*b5893f02SDimitry Andric   LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
183*b5893f02SDimitry Andric                     << *Cmp << '\t' << *And);
184*b5893f02SDimitry Andric 
185*b5893f02SDimitry Andric   unsigned CCReg = CC->getReg();
186*b5893f02SDimitry Andric   LIS->RemoveMachineInstrFromMaps(*And);
187*b5893f02SDimitry Andric   MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
188*b5893f02SDimitry Andric                                 TII->get(Andn2Opc), And->getOperand(0).getReg())
189*b5893f02SDimitry Andric                             .addReg(ExecReg)
190*b5893f02SDimitry Andric                             .addReg(CCReg, CC->getSubReg());
191*b5893f02SDimitry Andric   And->eraseFromParent();
192*b5893f02SDimitry Andric   LIS->InsertMachineInstrInMaps(*Andn2);
193*b5893f02SDimitry Andric 
194*b5893f02SDimitry Andric   LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
195*b5893f02SDimitry Andric 
196*b5893f02SDimitry Andric   // Try to remove compare. Cmp value should not used in between of cmp
197*b5893f02SDimitry Andric   // and s_and_b64 if VCC or just unused if any other register.
198*b5893f02SDimitry Andric   if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
199*b5893f02SDimitry Andric        MRI.use_nodbg_empty(CmpReg)) ||
200*b5893f02SDimitry Andric       (CmpReg == CondReg &&
201*b5893f02SDimitry Andric        std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
202*b5893f02SDimitry Andric                     [&](const MachineInstr &MI) {
203*b5893f02SDimitry Andric                       return MI.readsRegister(CondReg, TRI); }))) {
204*b5893f02SDimitry Andric     LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
205*b5893f02SDimitry Andric 
206*b5893f02SDimitry Andric     LIS->RemoveMachineInstrFromMaps(*Cmp);
207*b5893f02SDimitry Andric     Cmp->eraseFromParent();
208*b5893f02SDimitry Andric 
209*b5893f02SDimitry Andric     // Try to remove v_cndmask_b32.
210*b5893f02SDimitry Andric     if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
211*b5893f02SDimitry Andric         MRI.use_nodbg_empty(SelReg)) {
212*b5893f02SDimitry Andric       LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
213*b5893f02SDimitry Andric 
214*b5893f02SDimitry Andric       LIS->RemoveMachineInstrFromMaps(*Sel);
215*b5893f02SDimitry Andric       Sel->eraseFromParent();
216*b5893f02SDimitry Andric     }
217*b5893f02SDimitry Andric   }
218*b5893f02SDimitry Andric 
219*b5893f02SDimitry Andric   return CCReg;
220*b5893f02SDimitry Andric }
221*b5893f02SDimitry Andric 
/// Per-function driver: for each basic block, (1) fold the
/// V_CNDMASK/V_CMP/S_AND negation pattern, (2) delete side-effect-free code
/// preceding S_ENDPGM, and (3) collapse adjacent end-cf S_OR_B64 pairs.
/// LiveIntervals is kept consistent by recomputing intervals for every
/// register touched by a removed or rewritten instruction.
bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveIntervals *LIS = &getAnalysis<LiveIntervals>();
  // Registers whose live ranges must be recomputed at the end. Exec halves
  // are always included since the pass exists to rewrite exec masking.
  DenseSet<unsigned> RecalcRegs({AMDGPU::EXEC_LO, AMDGPU::EXEC_HI});
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {

    if (unsigned Reg = optimizeVcndVcmpPair(MBB, ST, MRI, LIS)) {
      // The fold rewrote a VCC def; recompute the returned %cc plus the
      // condition-register units it may have perturbed.
      RecalcRegs.insert(Reg);
      RecalcRegs.insert(AMDGPU::VCC_LO);
      RecalcRegs.insert(AMDGPU::VCC_HI);
      RecalcRegs.insert(AMDGPU::SCC);
      Changed = true;
    }

    // Try to remove unneeded instructions before s_endpgm.
    if (MBB.succ_empty()) {
      if (MBB.empty())
        continue;

      // Skip this if the endpgm has any implicit uses, otherwise we would need
      // to be careful to update / remove them.
      MachineInstr &Term = MBB.back();
      if (Term.getOpcode() != AMDGPU::S_ENDPGM ||
          Term.getNumOperands() != 0)
        continue;

      SmallVector<MachineBasicBlock*, 4> Blocks({&MBB});

      // Walk backwards from the endpgm, deleting instructions with no
      // observable effect, and keep ascending through single-successor
      // predecessors as long as whole blocks become empty.
      while (!Blocks.empty()) {
        auto CurBB = Blocks.pop_back_val();
        auto I = CurBB->rbegin(), E = CurBB->rend();
        if (I != E) {
          // An unconditional branch (or the endpgm itself) can be stepped
          // over; a conditional branch ends the scan for this block.
          if (I->isUnconditionalBranch() || I->getOpcode() == AMDGPU::S_ENDPGM)
            ++I;
          else if (I->isBranch())
            continue;
        }

        while (I != E) {
          if (I->isDebugInstr()) {
            I = std::next(I);
            continue;
          }

          // Anything with a visible effect terminates the backward scan.
          if (I->mayStore() || I->isBarrier() || I->isCall() ||
              I->hasUnmodeledSideEffects() || I->hasOrderedMemoryRef())
            break;

          LLVM_DEBUG(dbgs()
                     << "Removing no effect instruction: " << *I << '\n');

          for (auto &Op : I->operands()) {
            if (Op.isReg())
              RecalcRegs.insert(Op.getReg());
          }

          // Grab the next iterator before erasing; erasing invalidates I.
          auto Next = std::next(I);
          LIS->RemoveMachineInstrFromMaps(*I);
          I->eraseFromParent();
          I = Next;

          Changed = true;
        }

        if (I != E)
          continue;

        // Try to ascend predecessors.
        for (auto *Pred : CurBB->predecessors()) {
          if (Pred->succ_size() == 1)
            Blocks.push_back(Pred);
        }
      }
      continue;
    }

    // Try to collapse adjacent endifs.
    auto Lead = MBB.begin(), E = MBB.end();
    if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI))
      continue;

    const MachineBasicBlock* Succ = *MBB.succ_begin();
    if (!MBB.isLayoutSuccessor(Succ))
      continue;

    auto I = std::next(Lead);

    // Only SALU instructions that do not read exec may sit between the two
    // end-cf instructions; any VALU use would observe the inner exec mask.
    for ( ; I != E; ++I)
      if (!TII->isSALU(*I) || I->readsRegister(AMDGPU::EXEC, TRI))
        break;

    if (I != E)
      continue;

    // The successor must itself start with an end-cf fed by a saved-exec
    // copy; then the inner (leading) end-cf is redundant.
    const auto NextLead = Succ->begin();
    if (NextLead == Succ->end() || !isEndCF(*NextLead, TRI) ||
        !getOrExecSource(*NextLead, *TII, MRI))
      continue;

    LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');

    auto SaveExec = getOrExecSource(*Lead, *TII, MRI);
    unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII);
    for (auto &Op : Lead->operands()) {
      if (Op.isReg())
        RecalcRegs.insert(Op.getReg());
    }

    LIS->RemoveMachineInstrFromMaps(*Lead);
    Lead->eraseFromParent();
    if (SaveExecReg) {
      LIS->removeInterval(SaveExecReg);
      LIS->createAndComputeVirtRegInterval(SaveExecReg);
    }

    Changed = true;

    // If the only use of saved exec in the removed instruction is S_AND_B64
    // fold the copy now.
    if (!SaveExec || !SaveExec->isFullCopy())
      continue;

    unsigned SavedExec = SaveExec->getOperand(0).getReg();
    bool SafeToReplace = true;
    // The copy of exec can be folded away only if all its users sit in the
    // same block, where exec is known not to change in between.
    for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
      if (U.getParent() != SaveExec->getParent()) {
        SafeToReplace = false;
        break;
      }

      LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
    }

    if (SafeToReplace) {
      LIS->RemoveMachineInstrFromMaps(*SaveExec);
      SaveExec->eraseFromParent();
      MRI.replaceRegWith(SavedExec, AMDGPU::EXEC);
      LIS->removeInterval(SavedExec);
    }
  }

  if (Changed) {
    // Recompute liveness for every register we touched: full interval
    // recomputation for virtual registers, register-unit invalidation (lazy
    // recompute) for physical ones.
    for (auto Reg : RecalcRegs) {
      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
        LIS->removeInterval(Reg);
        if (!MRI.reg_empty(Reg))
          LIS->createAndComputeVirtRegInterval(Reg);
      } else {
        for (MCRegUnitIterator U(Reg, TRI); U.isValid(); ++U)
          LIS->removeRegUnit(*U);
      }
    }
  }

  return Changed;
}
386