//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#include <algorithm>
#include <limits>

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

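// Target-specific match/apply helpers invoked by the TableGen-generated
// combine rules included further below.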
class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

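// Match a 32-bit G_SELECT whose condition is a single-use floating-point
// compare of the select's own operands, e.g.
//   %c = G_FCMP olt %x, %y
//   %r = G_SELECT %c, %x, %y
// which maps onto the legacy min/max instructions.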
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

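// Rewrite the matched select as G_AMDGPU_FMIN_LEGACY/G_AMDGPU_FMAX_LEGACY,
// swapping the operands where needed to keep the select's NaN behavior.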
void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

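// Match an integer-to-float conversion producing s16 or s32 whose source is
// known to have all bits above the low byte clear, so it can be lowered to
// the hardware byte-to-float conversion.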
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

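// Rewrite the conversion as G_AMDGPU_CVT_F32_UBYTE0, any-extending the source
// to 32 bits and truncating the result back to f16 when needed.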
void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

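// Fold a constant shift of the source into the byte index of a
// G_AMDGPU_CVT_F32_UBYTEn, e.g. cvt_f32_ubyte0 (lshr $x, 8) reads the same
// byte as cvt_f32_ubyte1 $x.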
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

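// Replace the conversion with the variant selecting the adjusted byte,
// any-extending the new source to 32 bits if it is narrower.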
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

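// Match a G_TRUNC of a 64-bit value that has already been clamped by a
// min/max constant pair lying within the signed 16-bit range.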
bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());

  // We want to check whether a 64-bit number gets clamped to 16-bit
  // boundaries (or below).
  if (SrcType != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const auto Cmp1 = MatchInfo.Cmp1;
    const auto Cmp2 = MatchInfo.Cmp2;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really clamping against the signed 16-bit boundaries (or a
    // narrower range)?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}


// An i64 clamped to i16 is legalized into the sequence
//   G_SELECT MIN/MAX, G_ICMP, G_SELECT MIN/MAX, G_ICMP, G_TRUNC.
// This can be written more efficiently as:
//   v_cvt_pk_i16_i32 v0, v0, v1
//   v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // G_UNMERGE_VALUES defines the low half first.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge->getOperand(0).getReg();
  Register Hi32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  Register CvtDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  auto min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, min);

  Register MaxBoundaryDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, max);

  Register MedDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

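// Glue state handing the CombinerHelper and the target helper above to the
// TableGen-generated combiner rules.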
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

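// Try the generated rules first, then fall back to the manually written
// combine that narrows 64-bit shifts.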
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

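// Gather the subtarget's legalizer info and the known-bits/dominator-tree
// analyses, then run the generic Combiner driver over the function.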
bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm