1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass performs the peephole optimizations before code emission.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include "llvm/Support/CommandLine.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "si-pre-emit-peephole"
25 
26 namespace {
27 
28 class SIPreEmitPeephole : public MachineFunctionPass {
29 private:
30   const SIInstrInfo *TII = nullptr;
31   const SIRegisterInfo *TRI = nullptr;
32 
33   bool optimizeVccBranch(MachineInstr &MI) const;
34   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
35 
36 public:
37   static char ID;
38 
39   SIPreEmitPeephole() : MachineFunctionPass(ID) {
40     initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
41   }
42 
43   bool runOnMachineFunction(MachineFunction &MF) override;
44 };
45 
46 } // End anonymous namespace.
47 
48 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
49                 "SI peephole optimizations", false, false)
50 
51 char SIPreEmitPeephole::ID = 0;
52 
53 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
54 
55 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
56   // Match:
57   // sreg = -1
58   // vcc = S_AND_B64 exec, sreg
59   // S_CBRANCH_VCC[N]Z
60   // =>
61   // S_CBRANCH_EXEC[N]Z
62   // We end up with this pattern sometimes after basic block placement.
63   // It happens while combining a block which assigns -1 to a saved mask and
64   // another block which consumes that saved mask and then a branch.
65   bool Changed = false;
66   MachineBasicBlock &MBB = *MI.getParent();
67   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
68   const bool IsWave32 = ST.isWave32();
69   const unsigned CondReg = TRI->getVCC();
70   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
71   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
72 
73   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
74                                       E = MBB.rend();
75   bool ReadsCond = false;
76   unsigned Threshold = 5;
77   for (++A; A != E; ++A) {
78     if (!--Threshold)
79       return false;
80     if (A->modifiesRegister(ExecReg, TRI))
81       return false;
82     if (A->modifiesRegister(CondReg, TRI)) {
83       if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
84         return false;
85       break;
86     }
87     ReadsCond |= A->readsRegister(CondReg, TRI);
88   }
89   if (A == E)
90     return false;
91 
92   MachineOperand &Op1 = A->getOperand(1);
93   MachineOperand &Op2 = A->getOperand(2);
94   if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
95     TII->commuteInstruction(*A);
96     Changed = true;
97   }
98   if (Op1.getReg() != ExecReg)
99     return Changed;
100   if (Op2.isImm() && Op2.getImm() != -1)
101     return Changed;
102 
103   Register SReg;
104   if (Op2.isReg()) {
105     SReg = Op2.getReg();
106     auto M = std::next(A);
107     bool ReadsSreg = false;
108     for (; M != E; ++M) {
109       if (M->definesRegister(SReg, TRI))
110         break;
111       if (M->modifiesRegister(SReg, TRI))
112         return Changed;
113       ReadsSreg |= M->readsRegister(SReg, TRI);
114     }
115     if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
116         M->getOperand(1).getImm() != -1)
117       return Changed;
118     // First if sreg is only used in and instruction fold the immediate
119     // into that and.
120     if (!ReadsSreg && Op2.isKill()) {
121       A->getOperand(2).ChangeToImmediate(-1);
122       M->eraseFromParent();
123     }
124   }
125 
126   if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
127       MI.killsRegister(CondReg, TRI))
128     A->eraseFromParent();
129 
130   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
131   if (SReg == ExecReg) {
132     if (IsVCCZ) {
133       MI.eraseFromParent();
134       return true;
135     }
136     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
137   } else {
138     MI.setDesc(
139         TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
140   }
141 
142   MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
143   MI.addImplicitDefUseOperands(*MBB.getParent());
144 
145   return true;
146 }
147 
148 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
149                                        MachineInstr &MI) const {
150   MachineBasicBlock &MBB = *MI.getParent();
151   const MachineFunction &MF = *MBB.getParent();
152   const MachineRegisterInfo &MRI = MF.getRegInfo();
153   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
154   Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
155   SmallVector<MachineInstr *, 4> ToRemove;
156   bool IdxOn = true;
157 
158   if (!MI.isIdenticalTo(First))
159     return false;
160 
161   // Scan back to find an identical S_SET_GPR_IDX_ON
162   for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
163        E = MI.getIterator(); I != E; ++I) {
164     switch (I->getOpcode()) {
165     case AMDGPU::S_SET_GPR_IDX_MODE:
166       return false;
167     case AMDGPU::S_SET_GPR_IDX_OFF:
168       IdxOn = false;
169       ToRemove.push_back(&*I);
170       break;
171     default:
172       if (I->modifiesRegister(AMDGPU::M0, TRI))
173         return false;
174       if (IdxReg && I->modifiesRegister(IdxReg, TRI))
175         return false;
176       if (llvm::any_of(I->operands(),
177                        [&MRI, this](const MachineOperand &MO) {
178                          return MO.isReg() &&
179                                 TRI->isVectorRegister(MRI, MO.getReg());
180                        })) {
181         // The only exception allowed here is another indirect vector move
182         // with the same mode.
183         if (!IdxOn ||
184             !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
185                I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
186               I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
187           return false;
188       }
189     }
190   }
191 
192   MI.eraseFromParent();
193   for (MachineInstr *RI : ToRemove)
194     RI->eraseFromParent();
195   return true;
196 }
197 
198 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
199   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
200   TII = ST.getInstrInfo();
201   TRI = &TII->getRegisterInfo();
202   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
203   bool Changed = false;
204 
205   for (MachineBasicBlock &MBB : MF) {
206     MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
207     if (MBBE != MBB.end()) {
208       MachineInstr &MI = *MBBE;
209       switch (MI.getOpcode()) {
210       case AMDGPU::S_CBRANCH_VCCZ:
211       case AMDGPU::S_CBRANCH_VCCNZ:
212         Changed |= optimizeVccBranch(MI);
213         continue;
214       case AMDGPU::SI_RETURN_TO_EPILOG:
215         // FIXME: This is not an optimization and should be
216         // moved somewhere else.
217         assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
218 
219         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
220         // because external bytecode will be appended at the end.
221         if (&MBB != &MF.back() || &MI != &MBB.back()) {
222           // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
223           // at the end and jump there.
224           if (!EmptyMBBAtEnd) {
225             EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
226             MF.insert(MF.end(), EmptyMBBAtEnd);
227           }
228 
229           MBB.addSuccessor(EmptyMBBAtEnd);
230           BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
231               .addMBB(EmptyMBBAtEnd);
232           MI.eraseFromParent();
233           MBBE = MBB.getFirstTerminator();
234         }
235         break;
236       default:
237         break;
238       }
239     }
240 
241     if (!ST.hasVGPRIndexMode())
242       continue;
243 
244     MachineInstr *SetGPRMI = nullptr;
245     const unsigned Threshold = 20;
246     unsigned Count = 0;
247     // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
248     // second is not needed. Do expensive checks in the optimizeSetGPR()
249     // and limit the distance to 20 instructions for compile time purposes.
250     for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
251       MachineInstr &MI = *MBBI;
252       ++MBBI;
253 
254       if (Count == Threshold)
255         SetGPRMI = nullptr;
256       else
257         ++Count;
258 
259       if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
260         continue;
261 
262       Count = 0;
263       if (!SetGPRMI) {
264         SetGPRMI = &MI;
265         continue;
266       }
267 
268       if (optimizeSetGPR(*SetGPRMI, MI))
269         Changed = true;
270       else
271         SetGPRMI = &MI;
272     }
273   }
274 
275   return Changed;
276 }
277