1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <limits>
31 
32 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
33 
34 using namespace llvm;
35 using namespace MIPatternMatch;
36 
37 class AMDGPUPreLegalizerCombinerHelper {
38 protected:
39   MachineIRBuilder &B;
40   MachineFunction &MF;
41   MachineRegisterInfo &MRI;
42   CombinerHelper &Helper;
43 
44 public:
45   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
46       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
47 
48   struct ClampI64ToI16MatchInfo {
49     int64_t Cmp1;
50     int64_t Cmp2;
51     Register Origin;
52   };
53 
54   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
55                           MachineFunction &MF,
56                           ClampI64ToI16MatchInfo &MatchInfo);
57 
58   void applyClampI64ToI16(MachineInstr &MI,
59                           const ClampI64ToI16MatchInfo &MatchInfo);
60 };
61 
62 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
63     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
64     ClampI64ToI16MatchInfo &MatchInfo) {
65   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
66 
67   // Try to find a pattern where an i64 value should get clamped to short.
68   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
69   if (SrcType != LLT::scalar(64))
70     return false;
71 
72   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
73   if (DstType != LLT::scalar(16))
74     return false;
75 
76   Register Base;
77 
78   // Try to match a combination of min / max MIR opcodes.
79   if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
80     if (!mi_match(Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
81       return false;
82     }
83   }
84 
85   if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
86     if (!mi_match(Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
87       return false;
88     }
89   }
90 
91   const auto Cmp1 = MatchInfo.Cmp1;
92   const auto Cmp2 = MatchInfo.Cmp2;
93   const auto Diff = std::abs(Cmp2 - Cmp1);
94 
95   // If the difference between both comparison values is 0 or 1, there is no
96   // need to clamp.
97   if (Diff == 0 || Diff == 1)
98     return false;
99 
100   const int64_t Min = std::numeric_limits<int16_t>::min();
101   const int64_t Max = std::numeric_limits<int16_t>::max();
102 
103   // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
104   return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
105           (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
106 }
107 
108 // We want to find a combination of instructions that
109 // gets generated when an i64 gets clamped to i16.
110 // The corresponding pattern is:
111 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
112 // This can be efficiently written as following:
113 // v_cvt_pk_i16_i32 v0, v0, v1
114 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
115 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
116     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
117   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
118 
119   Register Src = MatchInfo.Origin;
120   assert(MRI.getType(Src) == LLT::scalar(64));
121   const LLT S32 = LLT::scalar(32);
122 
123   B.setMBB(*MI.getParent());
124   B.setInstrAndDebugLoc(MI);
125 
126   auto Unmerge = B.buildUnmerge(S32, Src);
127 
128   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
129 
130   const LLT V2S16 = LLT::vector(2, 16);
131   auto CvtPk = B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
132     {V2S16},
133     {Unmerge.getReg(0), Unmerge.getReg(1)},
134     MI.getFlags());
135 
136   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
137   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
138   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
139   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
140 
141   auto Bitcast = B.buildBitcast({S32}, CvtPk);
142 
143   auto Med3 = B.buildInstr(AMDGPU::G_AMDGPU_MED3_S32,
144     {S32},
145     {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
146     MI.getFlags());
147 
148   auto Trunc = B.buildTrunc(LLT::scalar(16), Med3);
149   B.buildCopy(MI.getOperand(0).getReg(), Trunc);
150 
151   MI.eraseFromParent();
152 }
153 
154 class AMDGPUPreLegalizerCombinerHelperState {
155 protected:
156   CombinerHelper &Helper;
157   AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
158 
159 public:
160   AMDGPUPreLegalizerCombinerHelperState(
161       CombinerHelper &Helper,
162       AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
163       : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
164 };
165 
166 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
167 #include "AMDGPUGenPreLegalizeGICombiner.inc"
168 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
169 
170 namespace {
171 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
172 #include "AMDGPUGenPreLegalizeGICombiner.inc"
173 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
174 
175 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
176   GISelKnownBits *KB;
177   MachineDominatorTree *MDT;
178 
179 public:
180   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
181 
182   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
183                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
184       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
185                      /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
186         KB(KB), MDT(MDT) {
187     if (!GeneratedRuleCfg.parseCommandLineOption())
188       report_fatal_error("Invalid rule identifier");
189   }
190 
191   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
192                        MachineIRBuilder &B) const override;
193 };
194 
195 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
196                                               MachineInstr &MI,
197                                               MachineIRBuilder &B) const {
198   CombinerHelper Helper(Observer, B, KB, MDT);
199   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
200   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
201                                                  PreLegalizerHelper);
202 
203   if (Generated.tryCombineAll(Observer, MI, B, Helper))
204     return true;
205 
206   switch (MI.getOpcode()) {
207   case TargetOpcode::G_CONCAT_VECTORS:
208     return Helper.tryCombineConcatVectors(MI);
209   case TargetOpcode::G_SHUFFLE_VECTOR:
210     return Helper.tryCombineShuffleVector(MI);
211   }
212 
213   return false;
214 }
215 
216 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
217 #include "AMDGPUGenPreLegalizeGICombiner.inc"
218 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
219 
220 // Pass boilerplate
221 // ================
222 
223 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
224 public:
225   static char ID;
226 
227   AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
228 
229   StringRef getPassName() const override {
230     return "AMDGPUPreLegalizerCombiner";
231   }
232 
233   bool runOnMachineFunction(MachineFunction &MF) override;
234 
235   void getAnalysisUsage(AnalysisUsage &AU) const override;
236 private:
237   bool IsOptNone;
238 };
239 } // end anonymous namespace
240 
241 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
242   AU.addRequired<TargetPassConfig>();
243   AU.setPreservesCFG();
244   getSelectionDAGFallbackAnalysisUsage(AU);
245   AU.addRequired<GISelKnownBitsAnalysis>();
246   AU.addPreserved<GISelKnownBitsAnalysis>();
247   if (!IsOptNone) {
248     AU.addRequired<MachineDominatorTree>();
249     AU.addPreserved<MachineDominatorTree>();
250   }
251   MachineFunctionPass::getAnalysisUsage(AU);
252 }
253 
254 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
255   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
256   initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
257 }
258 
259 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
260   if (MF.getProperties().hasProperty(
261           MachineFunctionProperties::Property::FailedISel))
262     return false;
263   auto *TPC = &getAnalysis<TargetPassConfig>();
264   const Function &F = MF.getFunction();
265   bool EnableOpt =
266       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
267 
268   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
269   MachineDominatorTree *MDT =
270       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
271   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
272                                         F.hasMinSize(), KB, MDT);
273   Combiner C(PCInfo, TPC);
274   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
275 }
276 
// Definition of the pass ID declared in the class; the constructor passes it
// to MachineFunctionPass.
char AMDGPUPreLegalizerCombiner::ID = 0;
// Pass registration under the DEBUG_TYPE name, listing TargetPassConfig and
// GISelKnownBitsAnalysis as dependencies.
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)
286 
287 namespace llvm {
288 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
289   return new AMDGPUPreLegalizerCombiner(IsOptNone);
290 }
291 } // end namespace llvm
292