//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
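// Besides the TableGen-generated combines, this file implements a custom
// combine that folds a 64-bit signed min/max clamp feeding a G_TRUNC to
// 16 bits into a packed conversion plus a median-of-three clamp
// (see matchClampI64ToI16 / applyClampI64ToI16 below).
//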
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPreLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value should get clamped to short.
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  if (DstType != LLT::scalar(16))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  Register Base;

  // Try to match a min/max pair of MIR opcodes in either order. Bail out if
  // neither combination matches; MatchInfo is only valid on success.
  bool Matched = false;
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    Matched = mi_match(
        Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)));
  } else if (mi_match(MI.getOperand(1).getReg(), MRI,
                      m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    Matched = mi_match(
        Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)));
  }
  if (!Matched)
    return false;

  const auto Cmp1 = MatchInfo.Cmp1;
  const auto Cmp2 = MatchInfo.Cmp2;
  const auto Diff = std::abs(Cmp2 - Cmp1);

  // If the difference between both comparison values is 0 or 1, there is no
  // need to clamp.
  if (Diff == 0 || Diff == 1)
    return false;

  const int64_t Min = std::numeric_limits<int16_t>::min();
  const int64_t Max = std::numeric_limits<int16_t>::max();

  // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
  return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
}

// We want to find a combination of instructions that gets generated when an
// i64 value is clamped to i16. The corresponding pattern is:
// G_SMIN / G_SMAX (in either order) feeding a G_TRUNC from i64 to i16.
// This can be lowered more efficiently as:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
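//
// As a rough sketch (illustrative MIR only; register names and constants
// are invented), a clamp of %src to [-100, 100] such as
//
//   %lo:_(s64) = G_CONSTANT i64 -100
//   %hi:_(s64) = G_CONSTANT i64 100
//   %max:_(s64) = G_SMAX %src, %lo
//   %min:_(s64) = G_SMIN %max, %hi
//   %dst:_(s16) = G_TRUNC %min
//
// is rewritten into a G_UNMERGE_VALUES of %src, a V_CVT_PK_I16_I32_e64 of
// the two halves, a V_MED3_I32 with the clamp bounds, and a trunc/copy back
// to the original 16-bit result.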
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  B.setMBB(*MI.getParent());
  B.setInstrAndDebugLoc(MI);

  // Split the 64-bit source into its low and high 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge.getReg(0);
  Register Hi32 = Unmerge.getReg(1);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  const auto *RC = &AMDGPU::VGPR_32RegClass;

  // Saturate both halves into a packed pair of 16-bit values.
  Register CvtDst = MRI.createVirtualRegister(RC);
  MRI.setType(CvtDst, S32);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  MRI.setRegClass(MinBoundaryDst.getReg(0), RC);

  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
  MRI.setRegClass(MaxBoundaryDst.getReg(0), RC);

  // Clamp the packed result to [MinBoundary, MaxBoundary] with a
  // median-of-three.
  Register MedDst = MRI.createVirtualRegister(RC);
  MRI.setType(MedDst, S32);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst.getReg(0));
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst.getReg(0));
  Med.setMIFlags(MI.getFlags());

  // Truncate back to 16 bits and replace the original G_TRUNC result.
  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
  B.buildTrunc(TruncDst, MedDst);
  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);

  MI.eraseFromParent();
}

class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};

#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                 const AMDGPULegalizerInfo *LI,
                                 GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                             MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                PreLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  }

  return false;
}

#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                        F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm