//=== AMDGPUPostLegalizerCombiner.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

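// Helper implementing the AMDGPU-specific match/apply routines that the
// tablegen-generated post-legalizer combiner dispatches to.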
class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

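// Match select(fcmp(pred, x, y), x, y) or select(fcmp(pred, x, y), y, x),
// where the compare result has no other users.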
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

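  // Only the inequality predicates can be expressed as a legacy min/max.
  // Reject equality, ordered/unordered-only tests, and always-true/false.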
  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

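// Rewrite the matched select+fcmp as G_AMDGPU_FMIN_LEGACY or
// G_AMDGPU_FMAX_LEGACY, picking the opcode and operand order from the
// predicate so the select's NaN behavior is preserved.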
void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // legacy instructions select their second operand when the compare fails,
    // as it does for NaN, so order the operands to match the compare type the
    // hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

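// Match an integer-to-float conversion whose source is known to have all bits
// above the low byte clear, so the hardware's byte-to-float conversion can be
// used instead.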
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

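// Rewrite the conversion as G_AMDGPU_CVT_F32_UBYTE0. There is no direct
// byte-to-f16 conversion, so a 16-bit result goes through f32 and a trunc.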
void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

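// Match a G_AMDGPU_CVT_F32_UBYTEn fed by a constant, byte-aligned shift
// (looking through a zero-extend); the shift can be folded away by selecting
// a different byte index.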
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

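// Rewrite to the CVT_F32_UBYTE variant that reads the shifted byte directly
// from the unshifted source.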
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

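// State threaded through the tablegen-generated combiner so its rules can
// reach both the generic CombinerHelper and the AMDGPU helper above.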
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

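// Try the tablegen-generated combine rules first, then fall back to the
// manually written combines below.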
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
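  // Skip functions where instruction selection already failed; they will be
  // handled by the SelectionDAG fallback.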
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm