//===- AMDGPUPostLegalizerCombiner.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
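// Match a G_SELECT whose condition is a single-use G_FCMP on the same two
// values being selected, so it can be rewritten as G_AMDGPU_FMIN_LEGACY or
// G_AMDGPU_FMAX_LEGACY. Schematically (illustrative MIR, not from a test):
//
//   %c:_(s1) = G_FCMP floatpred(olt), %a(s32), %b
//   %d:_(s32) = G_SELECT %c(s1), %a, %b
//     -->
//   %d:_(s32) = G_AMDGPU_FMIN_LEGACY %a, %b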
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

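// G_AMDGPU_FMIN_LEGACY/G_AMDGPU_FMAX_LEGACY follow the legacy (DX9-style)
// min/max semantics: min_legacy(X, Y) is (X < Y) ? X : Y and max_legacy(X, Y)
// is (X > Y) ? X : Y, where the compare is false when either input is NaN, so
// a NaN input yields the second operand. The operand orders chosen below
// encode the NaN behavior implied by each predicate.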
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {
  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

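// Match an integer-to-float conversion whose source has its upper 24 bits
// known to be zero, so it can be lowered to the hardware byte-to-float
// conversion (G_AMDGPU_CVT_F32_UBYTE0). The opcodes this match applies to are
// determined by the generated rule set; this function only checks the
// destination type and the known bits of the source.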
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    const APInt Mask = APInt::getHighBitsSet(32, 24);
    return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
                                                    Mask);
  }

  return false;
}

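// Rewrite the matched conversion to G_AMDGPU_CVT_F32_UBYTE0. The byte
// conversion only produces f32, so an f16 destination goes through an f32
// intermediate followed by G_FPTRUNC.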
static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {MI.getOperand(1)}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {MI.getOperand(1)}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};

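// Fold a constant shift of the source (looking through G_ZEXT) into the byte
// index of a G_AMDGPU_CVT_F32_UBYTE* instruction. Schematically (illustrative
// MIR, not from a test):
//
//   %k:_(s32) = G_CONSTANT i32 8
//   %s:_(s32) = G_LSHR %x(s32), %k
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
//     -->
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE1 %x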
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

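// Drives the TableGen-generated combine rules (from
// AMDGPUGenPostLegalizeGICombiner.inc) plus a small set of hand-written
// opcode-based combines in combine() below.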
class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelper Generated;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!Generated.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm