1 //=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // after the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 <<<<<<< HEAD
15 <<<<<<< HEAD
16 #include "AMDGPU.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "GCNSubtarget.h"
19 =======
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPUTargetMachine.h"
22 >>>>>>> clang-format
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 =======
25 #include "AMDGPUTargetMachine.h"
26 #include "AMDGPULegalizerInfo.h"
27 
28 
29 >>>>>>> Updating formatting changes.
30 #include "llvm/CodeGen/GlobalISel/Combiner.h"
31 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
32 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
33 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
34 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
35 #include "llvm/CodeGen/MachineDominators.h"
36 #include "llvm/CodeGen/TargetPassConfig.h"
37 <<<<<<< HEAD
38 #include "llvm/Target/TargetMachine.h"
39 =======
40 #include "llvm/Support/Debug.h"
41 <<<<<<< HEAD
42 >>>>>>> clang-format
43 =======
44 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
45 >>>>>>> Updating formatting changes.
46 
47 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
48 
49 using namespace llvm;
50 using namespace MIPatternMatch;
51 
52 class AMDGPUPostLegalizerCombinerHelper {
53 protected:
54   MachineIRBuilder &B;
55   MachineFunction &MF;
56   MachineRegisterInfo &MRI;
57   CombinerHelper &Helper;
58 
59 public:
60   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
61       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
62 
63   struct FMinFMaxLegacyInfo {
64     Register LHS;
65     Register RHS;
66     Register True;
67     Register False;
68     CmpInst::Predicate Pred;
69   };
70 
71   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
72   bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
73   void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
74                                          const FMinFMaxLegacyInfo &Info);
75 
76   bool matchUCharToFloat(MachineInstr &MI);
77   void applyUCharToFloat(MachineInstr &MI);
78 
79   // FIXME: Should be able to have 2 separate matchdatas rather than custom
80   // struct boilerplate.
81   struct CvtF32UByteMatchInfo {
82     Register CvtVal;
83     unsigned ShiftOffset;
84   };
85 
86   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
87   void applyCvtF32UByteN(MachineInstr &MI,
88                          const CvtF32UByteMatchInfo &MatchInfo);
89 };
90 
91 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
92     MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
93   // FIXME: Combines should have subtarget predicates, and we shouldn't need
94   // this here.
95   if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
96     return false;
97 
98   // FIXME: Type predicate on pattern
99   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
100     return false;
101 
102   Register Cond = MI.getOperand(1).getReg();
103   if (!MRI.hasOneNonDBGUse(Cond) ||
104       !mi_match(Cond, MRI,
105                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
106     return false;
107 
108   Info.True = MI.getOperand(2).getReg();
109   Info.False = MI.getOperand(3).getReg();
110 
111   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
112       !(Info.LHS == Info.False && Info.RHS == Info.True))
113     return false;
114 
115   switch (Info.Pred) {
116   case CmpInst::FCMP_FALSE:
117   case CmpInst::FCMP_OEQ:
118   case CmpInst::FCMP_ONE:
119   case CmpInst::FCMP_ORD:
120   case CmpInst::FCMP_UNO:
121   case CmpInst::FCMP_UEQ:
122   case CmpInst::FCMP_UNE:
123   case CmpInst::FCMP_TRUE:
124     return false;
125   default:
126     return true;
127   }
128 }
129 
130 void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
131     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
132   B.setInstrAndDebugLoc(MI);
133   auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
134     B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
135   };
136 
137   switch (Info.Pred) {
138   case CmpInst::FCMP_ULT:
139   case CmpInst::FCMP_ULE:
140     if (Info.LHS == Info.True)
141       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
142     else
143       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
144     break;
145   case CmpInst::FCMP_OLE:
146   case CmpInst::FCMP_OLT: {
147     // We need to permute the operands to get the correct NaN behavior. The
148     // selected operand is the second one based on the failing compare with NaN,
149     // so permute it based on the compare type the hardware uses.
150     if (Info.LHS == Info.True)
151       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
152     else
153       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
154     break;
155   }
156   case CmpInst::FCMP_UGE:
157   case CmpInst::FCMP_UGT: {
158     if (Info.LHS == Info.True)
159       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
160     else
161       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
162     break;
163   }
164   case CmpInst::FCMP_OGT:
165   case CmpInst::FCMP_OGE: {
166     if (Info.LHS == Info.True)
167       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
168     else
169       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
170     break;
171   }
172   default:
173     llvm_unreachable("predicate should not have matched");
174   }
175 
176   MI.eraseFromParent();
177 }
178 
179 bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
180   Register DstReg = MI.getOperand(0).getReg();
181 
182   // TODO: We could try to match extracting the higher bytes, which would be
183   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
184   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
185   // about in practice.
186   LLT Ty = MRI.getType(DstReg);
187   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
188     Register SrcReg = MI.getOperand(1).getReg();
189     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
190     assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
191     const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
192     return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
193   }
194 
195   return false;
196 }
197 
198 void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
199   B.setInstrAndDebugLoc(MI);
200 
201   const LLT S32 = LLT::scalar(32);
202 
203   Register DstReg = MI.getOperand(0).getReg();
204   Register SrcReg = MI.getOperand(1).getReg();
205   LLT Ty = MRI.getType(DstReg);
206   LLT SrcTy = MRI.getType(SrcReg);
207   if (SrcTy != S32)
208     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
209 
210   if (Ty == S32) {
211     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
212                    {SrcReg}, MI.getFlags());
213   } else {
214     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
215                              {SrcReg}, MI.getFlags());
216     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
217   }
218 
219   MI.eraseFromParent();
220 }
221 
222 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
223     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
224   Register SrcReg = MI.getOperand(1).getReg();
225 
226   // Look through G_ZEXT.
227   mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
228 
229   Register Src0;
230   int64_t ShiftAmt;
231   bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
232   if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
233     const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
234 
235     unsigned ShiftOffset = 8 * Offset;
236     if (IsShr)
237       ShiftOffset += ShiftAmt;
238     else
239       ShiftOffset -= ShiftAmt;
240 
241     MatchInfo.CvtVal = Src0;
242     MatchInfo.ShiftOffset = ShiftOffset;
243     return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
244   }
245 
246   // TODO: Simplify demanded bits.
247   return false;
248 }
249 
250 void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
251     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
252   B.setInstrAndDebugLoc(MI);
253   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
254 
255   const LLT S32 = LLT::scalar(32);
256   Register CvtSrc = MatchInfo.CvtVal;
257   LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
258   if (SrcTy != S32) {
259     assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
260     CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
261   }
262 
263   assert(MI.getOpcode() != NewOpc);
264   B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
265   MI.eraseFromParent();
266 }
267 
268 class AMDGPUPostLegalizerCombinerHelperState {
269 protected:
270   CombinerHelper &Helper;
271   AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
272 
273 public:
274   AMDGPUPostLegalizerCombinerHelperState(
275       CombinerHelper &Helper,
276       AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
277       : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
278 };
279 
280 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
281 #include "AMDGPUGenPostLegalizeGICombiner.inc"
282 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
283 
284 namespace {
285 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
286 #include "AMDGPUGenPostLegalizeGICombiner.inc"
287 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
288 
289 class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
290   GISelKnownBits *KB;
291   MachineDominatorTree *MDT;
292 
293 public:
294   AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
295 
296   AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
297                                   const AMDGPULegalizerInfo *LI,
298                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
299       : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
300                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
301         KB(KB), MDT(MDT) {
302     if (!GeneratedRuleCfg.parseCommandLineOption())
303       report_fatal_error("Invalid rule identifier");
304   }
305 
306   bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
307                MachineIRBuilder &B) const override;
308 };
309 
310 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
311                                               MachineInstr &MI,
312                                               MachineIRBuilder &B) const {
313   CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
314   AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
315   AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
316                                                  PostLegalizerHelper);
317 
318   if (Generated.tryCombineAll(Observer, MI, B))
319     return true;
320 
321   switch (MI.getOpcode()) {
322   case TargetOpcode::G_SHL:
323   case TargetOpcode::G_LSHR:
324   case TargetOpcode::G_ASHR:
325     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
326     // common case, splitting this into a move and a 32-bit shift is faster and
327     // the same code size.
328     return Helper.tryCombineShiftToUnmerge(MI, 32);
329   }
330 
331   return false;
332 }
333 
334 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
335 #include "AMDGPUGenPostLegalizeGICombiner.inc"
336 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
337 
338 // Pass boilerplate
339 // ================
340 
341 class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
342 public:
343   static char ID;
344 
345   AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
346 
347   StringRef getPassName() const override {
348     return "AMDGPUPostLegalizerCombiner";
349   }
350 
351   bool runOnMachineFunction(MachineFunction &MF) override;
352 
353   void getAnalysisUsage(AnalysisUsage &AU) const override;
354 
355 private:
356   bool IsOptNone;
357 };
358 } // end anonymous namespace
359 
360 void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
361   AU.addRequired<TargetPassConfig>();
362   AU.setPreservesCFG();
363   getSelectionDAGFallbackAnalysisUsage(AU);
364   AU.addRequired<GISelKnownBitsAnalysis>();
365   AU.addPreserved<GISelKnownBitsAnalysis>();
366   if (!IsOptNone) {
367     AU.addRequired<MachineDominatorTree>();
368     AU.addPreserved<MachineDominatorTree>();
369   }
370   MachineFunctionPass::getAnalysisUsage(AU);
371 }
372 
373 AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
374     : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
375   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
376 }
377 
378 bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
379   if (MF.getProperties().hasProperty(
380           MachineFunctionProperties::Property::FailedISel))
381     return false;
382   auto *TPC = &getAnalysis<TargetPassConfig>();
383   const Function &F = MF.getFunction();
384   bool EnableOpt =
385       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
386 
387   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
388   const AMDGPULegalizerInfo *LI
389     = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
390 
391   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
392   MachineDominatorTree *MDT =
393       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
394   AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
395                                          F.hasMinSize(), LI, KB, MDT);
396   Combiner C(PCInfo, TPC);
397   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
398 }
399 
400 char AMDGPUPostLegalizerCombiner::ID = 0;
401 INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
402                       "Combine AMDGPU machine instrs after legalization",
403                       false, false)
404 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
405 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
406 INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
407                     "Combine AMDGPU machine instrs after legalization", false,
408                     false)
409 
410 namespace llvm {
411 FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
412   return new AMDGPUPostLegalizerCombiner(IsOptNone);
413 }
414 } // end namespace llvm
415