//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

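// Combine a G_SELECT fed by a single-use G_FCMP of the two selected values
// into the target's legacy (non-IEEE) min/max opcodes, e.g.
// (select (fcmp olt x, y), x, y) -> G_AMDGPU_FMIN_LEGACY x, y.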
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

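// Match an integer-to-FP conversion whose source provably fits in the low
// byte (all higher bits known zero), so it can be lowered to the single
// CVT_F32_UBYTE0 instruction instead of a full integer-to-float conversion.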
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

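// Fold a constant byte-aligned shift of the source into the byte index of a
// G_AMDGPU_CVT_F32_UBYTEn instruction, e.g.
// (CVT_F32_UBYTE0 (G_LSHR x, 8)) -> (CVT_F32_UBYTE1 x).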
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

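// Match a clamp of an i64 value to the i16 range: a G_TRUNC from s64 whose
// source is a min/max combination against two constants that both lie within
// [INT16_MIN, INT16_MAX]. The apply step below lowers this to
// V_CVT_PK_I16_I32 followed by V_MED3_I32 with the clamp bounds.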
bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const int64_t Cmp1 = MatchInfo.Cmp1;
    const int64_t Cmp2 = MatchInfo.Cmp2;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really trying to clamp against the i16 boundaries?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining Clamp i64 to i16\n");

  B.setInstrAndDebugLoc(MI);

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Hi32 = Unmerge->getOperand(0).getReg();
  Register Lo32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);

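  // V_CVT_PK_I16_I32 packs its two 32-bit source operands into the low and
  // high halves of the result, signed-saturating each to the i16 range.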
  constexpr unsigned CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  Register CvtDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(CvtDst, &AMDGPU::VGPR_32RegClass);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Hi32);
  CvtPk.addReg(Lo32);
  CvtPk.setMIFlags(MI.getFlags());

  const int64_t Min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t Max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MinBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, Min);

  Register MaxBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MaxBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, Max);

  Register MedDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MedDst, &AMDGPU::VGPR_32RegClass);

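  // V_MED3_I32 selects the median of its three operands; with the clamp
  // bounds as the outer operands this clamps the packed value to [Min, Max].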
  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
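    // For a constant shift amount >= 32 on s64 this becomes, roughly:
    //   %lo:s32, %hi:s32 = G_UNMERGE_VALUES %src:s64
    //   %part:s32 = G_LSHR %hi, (amt - 32)
    //   %dst:s64 = G_MERGE_VALUES %part, 0
    // (sketch for G_LSHR; G_SHL and G_ASHR are handled analogously).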
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm