//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cstdlib>
#include <limits>

#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;
class AMDGPUPreLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

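  // Holds the two clamp boundary constants found in the min/max chain and the
  // original 64-bit value that feeds it.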
  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value should get clamped to i16.
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  if (DstType != LLT::scalar(16))
    return false;

  Register Base;

  // Try to match a combination of min / max MIR opcodes. Bail out if neither
  // a min-of-max nor a max-of-min chain is found, so MatchInfo never gets
  // used uninitialized.
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (!mi_match(Base, MRI,
                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return false;
    }
  } else if (mi_match(MI.getOperand(1).getReg(), MRI,
                      m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (!mi_match(Base, MRI,
                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return false;
    }
  } else {
    return false;
  }

  const auto Cmp1 = MatchInfo.Cmp1;
  const auto Cmp2 = MatchInfo.Cmp2;
  const auto Diff = std::abs(Cmp2 - Cmp1);

  // If the difference between both comparison values is 0 or 1, there is no
  // need to clamp.
  if (Diff == 0 || Diff == 1)
    return false;

  const int64_t Min = std::numeric_limits<int16_t>::min();
  const int64_t Max = std::numeric_limits<int16_t>::max();

  // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
  return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
}

// We want to find the combination of instructions that
// gets generated when an i64 gets clamped to i16.
// The corresponding pattern is:
// G_SMIN / G_SMAX feeding a G_TRUNC from i64 to i16.
// This can be written more efficiently as:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
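// For illustration only (register names are made up), the matched MIR looks
// roughly like this, where %src is the original 64-bit value (MatchInfo.Origin):
//   %lo:_(s64) = G_CONSTANT i64 -32768
//   %hi:_(s64) = G_CONSTANT i64 32767
//   %max:_(s64) = G_SMAX %src, %lo
//   %min:_(s64) = G_SMIN %max, %hi
//   %res:_(s16) = G_TRUNC %min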
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  B.setMBB(*MI.getParent());
  B.setInstrAndDebugLoc(MI);

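  // Split the 64-bit source into its two 32-bit halves; the first unmerge
  // result is the low half.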
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge.getReg(0);
  Register Hi32 = Unmerge.getReg(1);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);

  Register CvtDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  const LLT V2S16 = LLT::vector(2, 16);
  MRI.setType(CvtDst, V2S16);

  B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {CvtDst}, {Lo32, Hi32},
               MI.getFlags());

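  // Materialize the clamp boundaries as 32-bit constants so they can feed
  // v_med3_i32.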
  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  MRI.setRegClass(MinBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);

  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
  MRI.setRegClass(MaxBoundaryDst.getReg(0), &AMDGPU::VGPR_32RegClass);

  Register MedDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MRI.setType(MedDst, S32);

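  // v_med3_i32 selects the middle of (min, packed value, max), which clamps
  // the packed result to the matched boundaries.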
  B.buildInstr(AMDGPU::V_MED3_I32, {MedDst},
               {MinBoundaryDst.getReg(0), CvtDst, MaxBoundaryDst.getReg(0)},
               MI.getFlags());

  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
  B.buildTrunc(TruncDst, MedDst);
  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);

  MI.eraseFromParent();
}

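// State class the TableGen-erated combiner helper derives from; it gives the
// generated rules access to the generic and the AMDGPU-specific helpers.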
class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};

#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                 GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                       MachineIRBuilder &B) const override;
};

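// Called by the generic Combiner driver for every candidate instruction; runs
// the TableGen-erated rules first and then falls back to a few manual combines.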
bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                             MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                PreLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

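  // A couple of vector combines are still handled manually instead of through
  // the generated rule set.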
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  }

  return false;
}

#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

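// Known-bits analysis is always required; the dominator tree is only used
// when the pass runs with optimizations enabled.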
void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
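  // If instruction selection already failed for this function, GlobalISel is
  // falling back to the DAG path, so there is nothing to combine here.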
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                        F.hasMinSize(), KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm