//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
20 #include "llvm/CodeGen/GlobalISel/Combiner.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/TargetPassConfig.h"
27 #include "llvm/Target/TargetMachine.h"
28 
29 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
34 class AMDGPUPreLegalizerCombinerHelper {
35 protected:
36   MachineIRBuilder &B;
37   MachineFunction &MF;
38   MachineRegisterInfo &MRI;
39   CombinerHelper &Helper;
40 
41 public:
42   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}
44 
45   struct ClampI64ToI16MatchInfo {
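    // Cmp1 and Cmp2 are the two constant clamp boundaries taken from the
    // matched G_SMIN / G_SMAX pair; Origin is the original s64 value that is
    // being clamped.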
46     int64_t Cmp1;
47     int64_t Cmp2;
48     Register Origin;
49   };
50 
51   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
52                           MachineFunction &MF,
53                           ClampI64ToI16MatchInfo &MatchInfo);
54 
55   void applyClampI64ToI16(MachineInstr &MI,
56                           const ClampI64ToI16MatchInfo &MatchInfo);
57 };
58 
59 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
60     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
61     ClampI64ToI16MatchInfo &MatchInfo) {
62   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
63 
64   // Try to find a pattern where an i64 value should get clamped to short.
65   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
66   if (SrcType != LLT::scalar(64))
67     return false;
68 
69   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
70   if (DstType != LLT::scalar(16))
71     return false;
72 
73   LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
74 
75   Register Base;
76 
  // Try to match a combination of min / max MIR opcodes.
  bool Matched = false;
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    Matched = mi_match(
        Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)));
  } else if (mi_match(MI.getOperand(1).getReg(), MRI,
                      m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    Matched = mi_match(
        Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)));
  }

  // Bail out if neither a min(max(...)) nor a max(min(...)) pattern was found;
  // MatchInfo is only valid after a successful match.
  if (!Matched)
    return false;
89 
90   const auto Cmp1 = MatchInfo.Cmp1;
91   const auto Cmp2 = MatchInfo.Cmp2;
92   const auto Diff = std::abs(Cmp2 - Cmp1);
93 
94   // If the difference between both comparison values is 0 or 1, there is no
95   // need to clamp.
96   if (Diff == 0 || Diff == 1)
97     return false;
98 
99   const int64_t Min = std::numeric_limits<int16_t>::min();
100   const int64_t Max = std::numeric_limits<int16_t>::max();
101 
  // Check that both clamp boundaries fit into the signed 16-bit range.
103   return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
104           (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
105 }
106 
// We want to find the combination of instructions that gets generated when an
// i64 value is clamped to i16: a G_SMIN / G_SMAX pair on the i64 value,
// followed by a G_TRUNC to i16.
// This can be lowered efficiently as:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
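//
// For example, the matched input looks roughly like the following MIR, where
// %lo and %hi are G_CONSTANTs within the i16 range (register names are
// illustrative):
//   %max:_(s64) = G_SMAX %x:_(s64), %lo:_(s64)
//   %min:_(s64) = G_SMIN %max:_(s64), %hi:_(s64)
//   %res:_(s16) = G_TRUNC %min:_(s64)
// The same pattern with the G_SMIN / G_SMAX order swapped is matched as well.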
114 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
115     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
118   Register Src = MatchInfo.Origin;
119   assert(MRI.getType(Src) == LLT::scalar(64));
120   const LLT S32 = LLT::scalar(32);
121 
122   B.setMBB(*MI.getParent());
123   B.setInstrAndDebugLoc(MI);
124 
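  // Replacement sketch: split the s64 source into two s32 halves, saturate and
  // pack them with v_cvt_pk_i16_i32, clamp the packed value to the matched
  // boundaries with v_med3_i32, and truncate the result back to s16.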
  // G_UNMERGE_VALUES returns the low half as result 0 and the high half as
  // result 1.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge.getReg(0);
  Register Hi32 = Unmerge.getReg(1);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
130 
131   constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
132   assert(MI.getOpcode() != CvtOpcode);
133 
134   const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;
135 
136   Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
137   MRI.setType(CvtDst, S32);
138 
139   auto CvtPk = B.buildInstr(CvtOpcode);
140   CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
143   CvtPk.setMIFlags(MI.getFlags());
144 
145   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
146   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
147 
148   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
149   MRI.setRegClass(MinBoundaryDst.getReg(0), REG_CLASS);
150 
151   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
152   MRI.setRegClass(MaxBoundaryDst.getReg(0), REG_CLASS);
153 
154   Register MedDst = MRI.createVirtualRegister(REG_CLASS);
155   MRI.setType(MedDst, S32);
156 
157   auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
158   Med.addDef(MedDst);
159   Med.addReg(MinBoundaryDst.getReg(0));
160   Med.addReg(CvtDst);
161   Med.addReg(MaxBoundaryDst.getReg(0));
162   Med.setMIFlags(MI.getFlags());
163 
164   Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
165   B.buildTrunc(TruncDst, MedDst);
166   B.buildCopy(MI.getOperand(0).getReg(), TruncDst);
167 
168   MI.eraseFromParent();
169 }
170 
171 class AMDGPUPreLegalizerCombinerHelperState {
172 protected:
173   CombinerHelper &Helper;
174   AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
175 
176 public:
177   AMDGPUPreLegalizerCombinerHelperState(
178       CombinerHelper &Helper,
179       AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
180       : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
181 };
182 
183 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
184 #include "AMDGPUGenPreLegalizeGICombiner.inc"
185 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
186 
187 namespace {
188 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
189 #include "AMDGPUGenPreLegalizeGICombiner.inc"
190 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
191 
192 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
193   GISelKnownBits *KB;
194   MachineDominatorTree *MDT;
195 
196 public:
197   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
198 
199   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
200                                   const AMDGPULegalizerInfo *LI,
201                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
202       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
203                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
204         KB(KB), MDT(MDT) {
205     if (!GeneratedRuleCfg.parseCommandLineOption())
206       report_fatal_error("Invalid rule identifier");
207   }
208 
209   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
210                        MachineIRBuilder &B) const override;
211 };
212 
213 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
214                                               MachineInstr &MI,
215                                               MachineIRBuilder &B) const {
216   CombinerHelper Helper(Observer, B, KB, MDT);
217   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                PreLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
222     return true;
223 
224   switch (MI.getOpcode()) {
225   case TargetOpcode::G_CONCAT_VECTORS:
226     return Helper.tryCombineConcatVectors(MI);
227   case TargetOpcode::G_SHUFFLE_VECTOR:
228     return Helper.tryCombineShuffleVector(MI);
229   }
230 
231   return false;
232 }
233 
234 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
235 #include "AMDGPUGenPreLegalizeGICombiner.inc"
236 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
237 
238 // Pass boilerplate
239 // ================
240 
241 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
242 public:
243   static char ID;
244 
245   AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
246 
247   StringRef getPassName() const override {
248     return "AMDGPUPreLegalizerCombiner";
249   }
250 
251   bool runOnMachineFunction(MachineFunction &MF) override;
252 
253   void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
255   bool IsOptNone;
256 };
257 } // end anonymous namespace
258 
259 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
260   AU.addRequired<TargetPassConfig>();
261   AU.setPreservesCFG();
262   getSelectionDAGFallbackAnalysisUsage(AU);
263   AU.addRequired<GISelKnownBitsAnalysis>();
264   AU.addPreserved<GISelKnownBitsAnalysis>();
265   if (!IsOptNone) {
266     AU.addRequired<MachineDominatorTree>();
267     AU.addPreserved<MachineDominatorTree>();
268   }
269   MachineFunctionPass::getAnalysisUsage(AU);
270 }
271 
272 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
273   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
274   initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
275 }
276 
277 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
278   if (MF.getProperties().hasProperty(
279           MachineFunctionProperties::Property::FailedISel))
280     return false;
281   auto *TPC = &getAnalysis<TargetPassConfig>();
282   const Function &F = MF.getFunction();
283   bool EnableOpt =
284       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
285 
286   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
287   const AMDGPULegalizerInfo *LI =
288       static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
289 
290   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
291   MachineDominatorTree *MDT =
292       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
293   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
294                                         F.hasMinSize(), LI, KB, MDT);
295   Combiner C(PCInfo, TPC);
296   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
297 }
298 
299 char AMDGPUPreLegalizerCombiner::ID = 0;
300 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
301                       "Combine AMDGPU machine instrs before legalization",
302                       false, false)
303 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
304 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
305 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
306                     "Combine AMDGPU machine instrs before legalization", false,
307                     false)
308 
309 namespace llvm {
310 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
311   return new AMDGPUPreLegalizerCombiner(IsOptNone);
312 }
313 } // end namespace llvm
314