//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

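  // The condition must be a G_FCMP with no other non-debug uses, so the
  // compare becomes dead once it is folded into the min/max.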
  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

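  // To form a min/max, the select must choose between the two compared
  // values, in either order.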
  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

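  // Only relational predicates correspond to a min/max; reject the
  // constant-result, equality/inequality, and ordered/unordered tests.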
  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
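    // The byte conversion reads only the low 8 bits of the source, so the
    // fold is valid only if all higher source bits are known to be zero.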
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
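  // G_AMDGPU_CVT_F32_UBYTE0 takes a 32-bit source but reads only its low
  // byte, so any-extend or truncate other source widths to 32 bits.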
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

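  // An f32 destination takes the conversion result directly. There is no
  // byte-to-half conversion, so for f16 convert to f32 and narrow the result
  // with G_FPTRUNC.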
  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

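    // Fold the shift into the byte index: byte N of (X >> K) is byte N + K/8
    // of X, and byte N of (X << K) is byte N - K/8. K must be a multiple of
    // 8, which is enforced below.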
    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
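    // Bail unless the adjusted offset is byte aligned and selects byte 1, 2,
    // or 3 of a 32-bit source.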
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

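// State made available to the TableGen-generated combiner; the generated
// helper class derives from this to reach the match/apply implementations.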
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

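  // Try the TableGen-generated combines before the manually implemented ones.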
  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
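  // Skip functions where GlobalISel already failed; the SelectionDAG
  // fallback will handle them.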
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm