//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp ------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // after the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUTargetMachine.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "llvm/CodeGen/GlobalISel/Combiner.h"
17 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
18 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
19 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
20 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
21 #include "llvm/CodeGen/MachineDominators.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/Support/Debug.h"
25 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26 
27 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
// Operands of a G_SELECT whose condition is a one-use G_FCMP, collected by
// matchFMinFMaxLegacy and consumed by applySelectFCmpToFMinToFMaxLegacy.
struct FMinFMaxLegacyInfo {
  Register LHS;            // Left operand of the G_FCMP.
  Register RHS;            // Right operand of the G_FCMP.
  Register True;           // Value selected when the compare is true.
  Register False;          // Value selected when the compare is false.
  CmpInst::Predicate Pred; // Floating-point predicate of the G_FCMP.
};
39 
40 // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
41 static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
42                                 MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
43   // FIXME: Combines should have subtarget predicates, and we shouldn't need
44   // this here.
45   if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
46     return false;
47 
48   // FIXME: Type predicate on pattern
49   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
50     return false;
51 
52   Register Cond = MI.getOperand(1).getReg();
53   if (!MRI.hasOneNonDBGUse(Cond) ||
54       !mi_match(Cond, MRI,
55                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
56     return false;
57 
58   Info.True = MI.getOperand(2).getReg();
59   Info.False = MI.getOperand(3).getReg();
60 
61   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
62       !(Info.LHS == Info.False && Info.RHS == Info.True))
63     return false;
64 
65   switch (Info.Pred) {
66   case CmpInst::FCMP_FALSE:
67   case CmpInst::FCMP_OEQ:
68   case CmpInst::FCMP_ONE:
69   case CmpInst::FCMP_ORD:
70   case CmpInst::FCMP_UNO:
71   case CmpInst::FCMP_UEQ:
72   case CmpInst::FCMP_UNE:
73   case CmpInst::FCMP_TRUE:
74     return false;
75   default:
76     return true;
77   }
78 }
79 
// Rewrite the matched select(fcmp(LHS, RHS), True, False) into a single
// G_AMDGPU_FMIN_LEGACY / G_AMDGPU_FMAX_LEGACY instruction and erase the
// select. Info was filled in by matchFMinFMaxLegacy, which guarantees that
// {True, False} == {LHS, RHS} and that Pred is one of the eight ordering
// predicates handled below. The operand order chosen per case is what gives
// the legacy min/max the same NaN behavior as the original select.
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {

  // Emit `Opc dst, X, Y` in place of MI, reusing MI's destination operand and
  // MI flags (e.g. fast-math flags).
  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    // Unordered less-than: select LHS -> min, select RHS -> max, with
    // operands ordered so the NaN-selected operand lands in the slot the
    // legacy instruction returns on an unordered compare.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    // Unordered greater-than: mirror image of the ULT/ULE case.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    // Ordered greater-than: mirror image of the OLT/OLE case.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    // matchFMinFMaxLegacy rejects every other predicate.
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}
129 
130 static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
131                               MachineFunction &MF, CombinerHelper &Helper) {
132   Register DstReg = MI.getOperand(0).getReg();
133 
134   // TODO: We could try to match extracting the higher bytes, which would be
135   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
136   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
137   // about in practice.
138   LLT Ty = MRI.getType(DstReg);
139   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
140     const APInt Mask = APInt::getHighBitsSet(32, 24);
141     return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
142                                                     Mask);
143   }
144 
145   return false;
146 }
147 
148 static void applyUCharToFloat(MachineInstr &MI) {
149   MachineIRBuilder B(MI);
150 
151   const LLT S32 = LLT::scalar(32);
152 
153   Register DstReg = MI.getOperand(0).getReg();
154   LLT Ty = B.getMRI()->getType(DstReg);
155 
156   if (Ty == S32) {
157     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
158                    {MI.getOperand(1)}, MI.getFlags());
159   } else {
160     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
161                              {MI.getOperand(1)}, MI.getFlags());
162     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
163   }
164 
165   MI.eraseFromParent();
166 }
167 
168 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
169 #include "AMDGPUGenPostLegalizeGICombiner.inc"
170 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
171 
172 namespace {
173 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
174 #include "AMDGPUGenPostLegalizeGICombiner.inc"
175 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
176 
// CombinerInfo driving the post-legalizer combiner: dispatches to the
// TableGen-generated rules first, then to a small set of manual combines.
class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT; // May be null when built with MDT == nullptr.

public:
  // TableGen-generated rule set (see AMDGPUGenPostLegalizeGICombiner.inc).
  AMDGPUGenPostLegalizerCombinerHelper Generated;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    // The generated helper supports disabling individual rules from the
    // command line; reject unknown rule identifiers up front.
    if (!Generated.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  // Attempt to combine MI; returns true if anything changed.
  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};
197 
198 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
199                                               MachineInstr &MI,
200                                               MachineIRBuilder &B) const {
201   CombinerHelper Helper(Observer, B, KB, MDT);
202 
203   if (Generated.tryCombineAll(Observer, MI, B, Helper))
204     return true;
205 
206   switch (MI.getOpcode()) {
207   case TargetOpcode::G_SHL:
208   case TargetOpcode::G_LSHR:
209   case TargetOpcode::G_ASHR:
210     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
211     // common case, splitting this into a move and a 32-bit shift is faster and
212     // the same code size.
213     return Helper.tryCombineShiftToUnmerge(MI, 32);
214   }
215 
216   return false;
217 }
218 
219 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
220 #include "AMDGPUGenPostLegalizeGICombiner.inc"
221 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
222 
223 // Pass boilerplate
224 // ================
225 
// MachineFunctionPass wrapper that runs the post-legalizer combiner over a
// function. IsOptNone selects whether dominator-tree-based combines are
// available (see getAnalysisUsage / runOnMachineFunction).
class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  bool IsOptNone; // True at -O0; skips the MachineDominatorTree requirement.
};
242 } // end anonymous namespace
243 
// Declare the analyses this pass needs and preserves. The combiner rewrites
// instructions within blocks only, so the CFG is preserved.
void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  // The dominator tree is only requested (and used) when optimizing.
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}
256 
// Construct the pass and ensure it is registered with the PassRegistry.
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
261 
262 bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
263   if (MF.getProperties().hasProperty(
264           MachineFunctionProperties::Property::FailedISel))
265     return false;
266   auto *TPC = &getAnalysis<TargetPassConfig>();
267   const Function &F = MF.getFunction();
268   bool EnableOpt =
269       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
270 
271   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
272   const AMDGPULegalizerInfo *LI
273     = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
274 
275   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
276   MachineDominatorTree *MDT =
277       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
278   AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
279                                          F.hasMinSize(), LI, KB, MDT);
280   Combiner C(PCInfo, TPC);
281   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
282 }
283 
char AMDGPUPostLegalizerCombiner::ID = 0;
// Register the pass and its analysis dependencies with the PassRegistry.
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)
293 
namespace llvm {
// Factory used by the AMDGPU pass pipeline. IsOptNone should be true at -O0,
// where the MachineDominatorTree-dependent parts of the combiner are skipped.
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
299