//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
28 
29 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
34 class AMDGPUPreLegalizerCombinerHelper {
35 protected:
36   MachineIRBuilder &B;
37   MachineFunction &MF;
38   MachineRegisterInfo &MRI;
39   CombinerHelper &Helper;
40 
41 public:
42   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
43       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
44 
45   struct ClampI64ToI16MatchInfo {
46     int64_t Cmp1;
47     int64_t Cmp2;
48     Register Origin;
49   };
50 
51   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
52                           MachineFunction &MF,
53                           ClampI64ToI16MatchInfo &MatchInfo);
54 
55   void applyClampI64ToI16(MachineInstr &MI,
56                           const ClampI64ToI16MatchInfo &MatchInfo);
57 };
58 
59 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
60     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
61     ClampI64ToI16MatchInfo &MatchInfo) {
62   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
63 
64   // we want to check if a 64-bit number gets clamped to 16-bit boundaries (or
65   // below).
66   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
67 
68   if (SrcType != LLT::scalar(64))
69     return false;
70 
71   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
72 
73   if (DstType != LLT::scalar(16))
74     return false;
75 
76   MachineIRBuilder B(MI);
77 
78   LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
79 
80   Register Base;
81 
82   // match max / min pattern
83   if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
84     if (!mi_match(Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
85       return false;
86     }
87   }
88 
89   if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
90     if (!mi_match(Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
91       return false;
92     }
93   }
94 
95   const auto Cmp1 = MatchInfo.Cmp1;
96   const auto Cmp2 = MatchInfo.Cmp2;
97   const auto Diff = std::abs(Cmp2 - Cmp1);
98 
99   // we don't need to clamp here.
100   if (Diff == 0 || Diff == 1)
101     return false;
102 
103   const int64_t Min = std::numeric_limits<int16_t>::min();
104   const int64_t Max = std::numeric_limits<int16_t>::max();
105 
106   // are we really trying to clamp against the relevant boundaries?
107   return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
108           (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
109 }
110 
// We want to find a combination of instructions that
// gets generated when an i64 gets clamped to i16.
// The corresponding pattern is:
// G_SMIN / G_SMAX for i16 <= G_TRUNC i64.
// This can be efficiently rewritten as follows:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
118 
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  // Insert the replacement sequence right before the matched G_TRUNC.
  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  // The original, unclamped 64-bit value bound by the matcher.
  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // Split the 64-bit source into two 32-bit halves and give both halves a
  // VGPR_32 register class so they can feed the target instructions below.
  // NOTE(review): G_UNMERGE_VALUES defines the LOW half as result 0, so the
  // names Hi32/Lo32 look swapped here — confirm against the intended operand
  // order of v_cvt_pk_i16_i32.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Hi32 = Unmerge->getOperand(0).getReg();
  Register Lo32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  // The matcher only fires on G_TRUNC, so MI cannot already be the
  // conversion instruction.
  assert(MI.getOpcode() != CvtOpcode);

  const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;

  // Build v_cvt_pk_i16_i32 over the two halves, producing one s32 result.
  Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(CvtDst, S32);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Hi32);
  CvtPk.addReg(Lo32);
  CvtPk.setMIFlags(MI.getFlags());

  // Order the two clamp constants so the smaller/larger one goes into a fixed
  // operand slot of v_med3_i32, regardless of whether the matched pattern was
  // smin(smax(...)) or smax(smin(...)).
  auto min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  // Materialize both clamp boundaries as s32 constants in VGPRs.
  Register MinBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MinBoundaryDst, S32);
  B.buildConstant(MinBoundaryDst, min);

  Register MaxBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MaxBoundaryDst, S32);
  B.buildConstant(MaxBoundaryDst, max);

  Register MedDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MedDst, S32);

  // v_med3_i32 picks the median of (min, converted value, max), which is
  // exactly the clamp.
  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  // Truncate the 32-bit median down to the s16 the original G_TRUNC produced
  // and forward it to the old destination register, then delete the G_TRUNC.
  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
  B.buildTrunc(TruncDst, MedDst);
  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);

  MI.eraseFromParent();
}
177 
// State base class for the tablegen-generated combiner. The `Helper` and
// `PreLegalizerHelper` members are presumably referenced by name from the
// generated AMDGPUGenPreLegalizeGICombiner.inc code — verify before renaming.
class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};
189 
190 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
191 #include "AMDGPUGenPreLegalizeGICombiner.inc"
192 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
193 
194 namespace {
195 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
196 #include "AMDGPUGenPreLegalizeGICombiner.inc"
197 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
198 
199 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
200   GISelKnownBits *KB;
201   MachineDominatorTree *MDT;
202 
203 public:
204   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
205 
206   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
207                                   const AMDGPULegalizerInfo *LI,
208                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
209       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
210                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
211         KB(KB), MDT(MDT) {
212     if (!GeneratedRuleCfg.parseCommandLineOption())
213       report_fatal_error("Invalid rule identifier");
214   }
215 
216   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
217                        MachineIRBuilder &B) const override;
218 };
219 
220 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
221                                               MachineInstr &MI,
222                                               MachineIRBuilder &B) const {
223   CombinerHelper Helper(Observer, B, KB, MDT);
224   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
225   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
226                                                  PreLegalizerHelper);
227 
228   if (Generated.tryCombineAll(Observer, MI, B, Helper))
229     return true;
230 
231   switch (MI.getOpcode()) {
232   case TargetOpcode::G_CONCAT_VECTORS:
233     return Helper.tryCombineConcatVectors(MI);
234   case TargetOpcode::G_SHUFFLE_VECTOR:
235     return Helper.tryCombineShuffleVector(MI);
236   }
237 
238   return false;
239 }
240 
241 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
242 #include "AMDGPUGenPreLegalizeGICombiner.inc"
243 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
244 
245 // Pass boilerplate
246 // ================
247 
// MachineFunctionPass wrapper that runs the pre-legalizer combiner over a
// machine function.
class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  // IsOptNone = true disables use of the dominator tree (see
  // getAnalysisUsage / runOnMachineFunction).
  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  // Set when compiling at -O0; the pass then runs without MachineDominatorTree.
  bool IsOptNone;
};
264 } // end anonymous namespace
265 
266 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
267   AU.addRequired<TargetPassConfig>();
268   AU.setPreservesCFG();
269   getSelectionDAGFallbackAnalysisUsage(AU);
270   AU.addRequired<GISelKnownBitsAnalysis>();
271   AU.addPreserved<GISelKnownBitsAnalysis>();
272   if (!IsOptNone) {
273     AU.addRequired<MachineDominatorTree>();
274     AU.addPreserved<MachineDominatorTree>();
275   }
276   MachineFunctionPass::getAnalysisUsage(AU);
277 }
278 
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  // Register this pass (and its dependencies) with the global pass registry.
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
283 
284 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
285   if (MF.getProperties().hasProperty(
286           MachineFunctionProperties::Property::FailedISel))
287     return false;
288   auto *TPC = &getAnalysis<TargetPassConfig>();
289   const Function &F = MF.getFunction();
290   bool EnableOpt =
291       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
292 
293   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
294   const AMDGPULegalizerInfo *LI =
295       static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
296 
297   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
298   MachineDominatorTree *MDT =
299       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
300   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
301                                         F.hasMinSize(), LI, KB, MDT);
302   Combiner C(PCInfo, TPC);
303   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
304 }
305 
char AMDGPUPreLegalizerCombiner::ID = 0;
// Legacy pass-manager registration; declares TargetPassConfig and the
// known-bits analysis as dependencies so they are scheduled before this pass.
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)
315 
namespace llvm {
// Factory used by the target machine's pass pipeline to create this pass.
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
321