1 //=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <cstdlib>
#include <limits>
28 
29 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
34 class AMDGPUPreLegalizerCombinerHelper {
35 protected:
36   MachineIRBuilder &B;
37   MachineFunction &MF;
38   MachineRegisterInfo &MRI;
39   CombinerHelper &Helper;
40 
41 public:
42   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
43       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
44 
45   struct ClampI64ToI16MatchInfo {
46     int64_t Cmp1;
47     int64_t Cmp2;
48     Register Origin;
49   };
50 
51   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
52                           MachineFunction &MF,
53                           ClampI64ToI16MatchInfo &MatchInfo);
54 
55   void applyClampI64ToI16(MachineInstr &MI,
56                           const ClampI64ToI16MatchInfo &MatchInfo);
57 };
58 
59 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
60     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
61     ClampI64ToI16MatchInfo &MatchInfo) {
62   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
63 
64   // we want to check if a 64-bit number gets clamped to 16-bit boundaries (or
65   // below).
66   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
67 
68   if (SrcType != LLT::scalar(64))
69     return false;
70 
71   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
72 
73   if (DstType != LLT::scalar(16))
74     return false;
75 
76   MachineIRBuilder B(MI);
77 
78   LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
79 
80   Register Base;
81 
82   // match max / min pattern
83   if (!mi_match(MI.getOperand(1).getReg(), MRI, m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1))))
84     return false;
85 
86   if (!mi_match(Base, MRI, m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2))))
87     return false;
88 
89   const auto Cmp1 = MatchInfo.Cmp1;
90   const auto Cmp2 = MatchInfo.Cmp2;
91   const auto Diff = std::abs(Cmp2 - Cmp1);
92 
93   // we don't need to clamp here.
94   if (Diff == 0 || Diff == 1)
95     return false;
96 
97   const int64_t Min = std::numeric_limits<int16_t>::min();
98   const int64_t Max = std::numeric_limits<int16_t>::max();
99 
100   // are we really trying to clamp against the relevant boundaries?
101   return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
102           (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
103 }
104 
// We want to find the combination of instructions that gets generated when
// an i64 is clamped to i16. The corresponding pattern is:
//   i16 G_TRUNC (i64 G_SMIN (G_SMAX x, lo), hi)
// with [lo, hi] inside the signed 16-bit range. This can be efficiently
// written as:
//   v_cvt_pk_i16_i32 v0, v0, v1
//   v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
// Rewrite the matched trunc-of-clamped-i64 into a V_CVT_PK_I16_I32 followed
// by a V_MED3_I32, then delete the original G_TRUNC. Emits raw target
// instructions ahead of the legalizer, so register classes are assigned
// manually here.
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  // The original (unclamped) 64-bit value captured by the matcher.
  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // Split the s64 into two s32 halves to feed the pack instruction.
  // NOTE(review): G_UNMERGE_VALUES def 0 is the low 32 bits and def 1 the
  // high 32 bits, so "Hi32"/"Lo32" look swapped relative to that — confirm
  // the intended operand order for V_CVT_PK_I16_I32_e64.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Hi32 = Unmerge->getOperand(0).getReg();
  Register Lo32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  // Guard against re-combining our own output.
  assert(MI.getOpcode() != CvtOpcode);

  const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;

  // 32-bit VGPR receiving the packed result of the cvt_pk.
  Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(CvtDst, S32);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Hi32);
  CvtPk.addReg(Lo32);
  CvtPk.setMIFlags(MI.getFlags());

  // Normalize the two matched constants into [min, max] for the med3, since
  // the matcher accepts them in either order.
  auto min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MinBoundaryDst, S32);
  B.buildConstant(MinBoundaryDst, min);

  Register MaxBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MaxBoundaryDst, S32);
  B.buildConstant(MaxBoundaryDst, max);

  Register MedDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MedDst, S32);

  // med3(min, x, max) clamps x into [min, max] in a single instruction.
  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  // Truncate back to s16 and forward the result to the original def, then
  // remove the now-dead G_TRUNC.
  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
  B.buildTrunc(TruncDst, MedDst);
  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);

  MI.eraseFromParent();
}
171 
// State exposed to the tablegen-generated combine rules (pulled in through
// the AMDGPUGenPreLegalizeGICombiner.inc includes below): the generic
// CombinerHelper plus the AMDGPU-specific pre-legalizer helper.
class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};
183 
184 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
185 #include "AMDGPUGenPreLegalizeGICombiner.inc"
186 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
187 
188 namespace {
189 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
190 #include "AMDGPUGenPreLegalizeGICombiner.inc"
191 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
192 
193 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
194   GISelKnownBits *KB;
195   MachineDominatorTree *MDT;
196 
197 public:
198   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
199 
200   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
201                                   const AMDGPULegalizerInfo *LI,
202                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
203       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
204                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
205         KB(KB), MDT(MDT) {
206     if (!GeneratedRuleCfg.parseCommandLineOption())
207       report_fatal_error("Invalid rule identifier");
208   }
209 
210   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
211                        MachineIRBuilder &B) const override;
212 };
213 
214 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
215                                               MachineInstr &MI,
216                                               MachineIRBuilder &B) const {
217   CombinerHelper Helper(Observer, B, KB, MDT);
218   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
219   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
220                                                  PreLegalizerHelper);
221 
222   if (Generated.tryCombineAll(Observer, MI, B, Helper))
223     return true;
224 
225   switch (MI.getOpcode()) {
226   case TargetOpcode::G_CONCAT_VECTORS:
227     return Helper.tryCombineConcatVectors(MI);
228   case TargetOpcode::G_SHUFFLE_VECTOR:
229     return Helper.tryCombineShuffleVector(MI);
230   }
231 
232   return false;
233 }
234 
235 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
236 #include "AMDGPUGenPreLegalizeGICombiner.inc"
237 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
238 
239 // Pass boilerplate
240 // ================
241 
// MachineFunctionPass boilerplate wrapping the combiner so it can run in the
// codegen pipeline.
class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID; // Pass identification.

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  // True at -O0; gates use of optimization-only analyses (dominator tree).
  bool IsOptNone;
};
258 } // end anonymous namespace
259 
260 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
261   AU.addRequired<TargetPassConfig>();
262   AU.setPreservesCFG();
263   getSelectionDAGFallbackAnalysisUsage(AU);
264   AU.addRequired<GISelKnownBitsAnalysis>();
265   AU.addPreserved<GISelKnownBitsAnalysis>();
266   if (!IsOptNone) {
267     AU.addRequired<MachineDominatorTree>();
268     AU.addPreserved<MachineDominatorTree>();
269   }
270   MachineFunctionPass::getAnalysisUsage(AU);
271 }
272 
// Construct the pass and make sure it is registered with the pass registry.
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
277 
278 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
279   if (MF.getProperties().hasProperty(
280           MachineFunctionProperties::Property::FailedISel))
281     return false;
282   auto *TPC = &getAnalysis<TargetPassConfig>();
283   const Function &F = MF.getFunction();
284   bool EnableOpt =
285       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
286 
287   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
288   const AMDGPULegalizerInfo *LI =
289       static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
290 
291   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
292   MachineDominatorTree *MDT =
293       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
294   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
295                                         F.hasMinSize(), LI, KB, MDT);
296   Combiner C(PCInfo, TPC);
297   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
298 }
299 
char AMDGPUPreLegalizerCombiner::ID = 0;
// Pass registration; the dependencies listed here mirror the analyses
// requested in getAnalysisUsage.
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)
309 
310 namespace llvm {
311 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
312   return new AMDGPUPreLegalizerCombiner(IsOptNone);
313 }
314 } // end namespace llvm
315