1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cstdint>
#include <limits>
31 
32 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
33 
34 using namespace llvm;
35 using namespace MIPatternMatch;
36 
37 class AMDGPUPreLegalizerCombinerHelper {
38 protected:
39   MachineIRBuilder &B;
40   MachineFunction &MF;
41   MachineRegisterInfo &MRI;
42   CombinerHelper &Helper;
43 
44 public:
45   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
46       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
47 
48   struct ClampI64ToI16MatchInfo {
49     int64_t Cmp1;
50     int64_t Cmp2;
51     Register Origin;
52   };
53 
54   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
55                           MachineFunction &MF,
56                           ClampI64ToI16MatchInfo &MatchInfo);
57 
58   void applyClampI64ToI16(MachineInstr &MI,
59                           const ClampI64ToI16MatchInfo &MatchInfo);
60 };
61 
62 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
63     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
64     ClampI64ToI16MatchInfo &MatchInfo) {
65   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
66 
67   // Try to find a pattern where an i64 value should get clamped to short.
68   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
69   if (SrcType != LLT::scalar(64))
70     return false;
71 
72   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
73   if (DstType != LLT::scalar(16))
74     return false;
75 
76   Register Base;
77 
78   // Try to match a combination of min / max MIR opcodes.
79   if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
80     if (!mi_match(Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
81       return false;
82     }
83   }
84 
85   if (mi_match(MI.getOperand(1).getReg(), MRI, m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
86     if (!mi_match(Base, MRI, m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
87       return false;
88     }
89   }
90 
91   const auto Cmp1 = MatchInfo.Cmp1;
92   const auto Cmp2 = MatchInfo.Cmp2;
93   const auto Diff = std::abs(Cmp2 - Cmp1);
94 
95   // If the difference between both comparison values is 0 or 1, there is no
96   // need to clamp.
97   if (Diff == 0 || Diff == 1)
98     return false;
99 
100   const int64_t Min = std::numeric_limits<int16_t>::min();
101   const int64_t Max = std::numeric_limits<int16_t>::max();
102 
103   // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
104   return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
105           (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
106 }
107 
108 // We want to find a combination of instructions that
109 // gets generated when an i64 gets clamped to i16.
110 // The corresponding pattern is:
111 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
112 // This can be efficiently written as following:
113 // v_cvt_pk_i16_i32 v0, v0, v1
114 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
115 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
116     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
117   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
118 
119   Register Src = MatchInfo.Origin;
120   assert(MRI.getType(Src) == LLT::scalar(64));
121   const LLT S32 = LLT::scalar(32);
122 
123   B.setMBB(*MI.getParent());
124   B.setInstrAndDebugLoc(MI);
125 
126   auto Unmerge = B.buildUnmerge(S32, Src);
127 
128   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
129 
130   const LLT V2S16 = LLT::vector(2, 16);
131   auto CvtPk = B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32,
132     {V2S16},
133     {Unmerge.getReg(0), Unmerge.getReg(1)},
134     MI.getFlags());
135 
136   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
137   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
138   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
139   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
140 
141   auto Bitcast = B.buildBitcast({S32}, CvtPk);
142 
143   auto Med3 = B.buildInstr(AMDGPU::G_AMDGPU_MED3,
144     {S32},
145     {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
146     MI.getFlags());
147 
148   B.buildTrunc(MI.getOperand(0).getReg(), Med3);
149 
150   MI.eraseFromParent();
151 }
152 
153 class AMDGPUPreLegalizerCombinerHelperState {
154 protected:
155   CombinerHelper &Helper;
156   AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
157 
158 public:
159   AMDGPUPreLegalizerCombinerHelperState(
160       CombinerHelper &Helper,
161       AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
162       : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
163 };
164 
165 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
166 #include "AMDGPUGenPreLegalizeGICombiner.inc"
167 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
168 
169 namespace {
170 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
171 #include "AMDGPUGenPreLegalizeGICombiner.inc"
172 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
173 
174 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
175   GISelKnownBits *KB;
176   MachineDominatorTree *MDT;
177 
178 public:
179   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
180 
181   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
182                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
183       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
184                      /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
185         KB(KB), MDT(MDT) {
186     if (!GeneratedRuleCfg.parseCommandLineOption())
187       report_fatal_error("Invalid rule identifier");
188   }
189 
190   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
191                        MachineIRBuilder &B) const override;
192 };
193 
194 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
195                                               MachineInstr &MI,
196                                               MachineIRBuilder &B) const {
197   CombinerHelper Helper(Observer, B, KB, MDT);
198   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
199   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
200                                                  PreLegalizerHelper);
201 
202   if (Generated.tryCombineAll(Observer, MI, B, Helper))
203     return true;
204 
205   switch (MI.getOpcode()) {
206   case TargetOpcode::G_CONCAT_VECTORS:
207     return Helper.tryCombineConcatVectors(MI);
208   case TargetOpcode::G_SHUFFLE_VECTOR:
209     return Helper.tryCombineShuffleVector(MI);
210   }
211 
212   return false;
213 }
214 
215 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
216 #include "AMDGPUGenPreLegalizeGICombiner.inc"
217 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
218 
219 // Pass boilerplate
220 // ================
221 
222 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
223 public:
224   static char ID;
225 
226   AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
227 
228   StringRef getPassName() const override {
229     return "AMDGPUPreLegalizerCombiner";
230   }
231 
232   bool runOnMachineFunction(MachineFunction &MF) override;
233 
234   void getAnalysisUsage(AnalysisUsage &AU) const override;
235 private:
236   bool IsOptNone;
237 };
238 } // end anonymous namespace
239 
240 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
241   AU.addRequired<TargetPassConfig>();
242   AU.setPreservesCFG();
243   getSelectionDAGFallbackAnalysisUsage(AU);
244   AU.addRequired<GISelKnownBitsAnalysis>();
245   AU.addPreserved<GISelKnownBitsAnalysis>();
246   if (!IsOptNone) {
247     AU.addRequired<MachineDominatorTree>();
248     AU.addPreserved<MachineDominatorTree>();
249   }
250   MachineFunctionPass::getAnalysisUsage(AU);
251 }
252 
253 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
254   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
255   initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
256 }
257 
258 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
259   if (MF.getProperties().hasProperty(
260           MachineFunctionProperties::Property::FailedISel))
261     return false;
262   auto *TPC = &getAnalysis<TargetPassConfig>();
263   const Function &F = MF.getFunction();
264   bool EnableOpt =
265       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
266 
267   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
268   MachineDominatorTree *MDT =
269       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
270   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
271                                         F.hasMinSize(), KB, MDT);
272   Combiner C(PCInfo, TPC);
273   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
274 }
275 
276 char AMDGPUPreLegalizerCombiner::ID = 0;
277 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
278                       "Combine AMDGPU machine instrs before legalization",
279                       false, false)
280 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
281 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
282 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
283                     "Combine AMDGPU machine instrs before legalization", false,
284                     false)
285 
286 namespace llvm {
287 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
288   return new AMDGPUPreLegalizerCombiner(IsOptNone);
289 }
290 } // end namespace llvm
291