//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
3 //
4 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 // See https://llvm.org/LICENSE.txt for license information.
6 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass does combining of machine instructions at the generic MI level,
11 // after the legalizer.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 <<<<<<< HEAD
16 #include "AMDGPU.h"
17 #include "AMDGPULegalizerInfo.h"
18 #include "GCNSubtarget.h"
19 =======
20 #include "AMDGPULegalizerInfo.h"
21 #include "AMDGPUTargetMachine.h"
22 >>>>>>> clang-format
23 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24 #include "llvm/CodeGen/GlobalISel/Combiner.h"
25 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
27 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/MachineDominators.h"
30 #include "llvm/CodeGen/TargetPassConfig.h"
31 <<<<<<< HEAD
32 #include "llvm/Target/TargetMachine.h"
33 =======
34 #include "llvm/Support/Debug.h"
35 >>>>>>> clang-format
36 
37 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 class AMDGPUPostLegalizerCombinerHelper {
43 protected:
44   MachineIRBuilder &B;
45   MachineFunction &MF;
46   MachineRegisterInfo &MRI;
47   CombinerHelper &Helper;
48 
49 public:
50   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}
52 
53   struct FMinFMaxLegacyInfo {
54     Register LHS;
55     Register RHS;
56     Register True;
57     Register False;
58     CmpInst::Predicate Pred;
59   };
60 
61   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
62   bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
63   void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
64                                          const FMinFMaxLegacyInfo &Info);
65 
66   bool matchUCharToFloat(MachineInstr &MI);
67   void applyUCharToFloat(MachineInstr &MI);
68 
69   // FIXME: Should be able to have 2 separate matchdatas rather than custom
70   // struct boilerplate.
71   struct CvtF32UByteMatchInfo {
72     Register CvtVal;
73     unsigned ShiftOffset;
74   };
75 
76   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
77   void applyCvtF32UByteN(MachineInstr &MI,
78                          const CvtF32UByteMatchInfo &MatchInfo);
79 
80   struct ClampI64ToI16MatchInfo {
81     int64_t Cmp1;
82     int64_t Cmp2;
83     Register Origin;
84   };
85 
86   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
87                           MachineFunction &MF,
88                           ClampI64ToI16MatchInfo &MatchInfo);
89 
90   void applyClampI64ToI16(MachineInstr &MI,
91                           const ClampI64ToI16MatchInfo &MatchInfo);
92 };
93 
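// Match a G_SELECT whose single-use condition is a G_FCMP of the same two
// values being selected, so the select can be rewritten as one of the legacy
// fmin/fmax instructions (see applySelectFCmpToFMinToFMaxLegacy below).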
94 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
95     MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
96   // FIXME: Combines should have subtarget predicates, and we shouldn't need
97   // this here.
98   if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
99     return false;
100 
101   // FIXME: Type predicate on pattern
102   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
103     return false;
104 
105   Register Cond = MI.getOperand(1).getReg();
106   if (!MRI.hasOneNonDBGUse(Cond) ||
107       !mi_match(Cond, MRI,
108                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
109     return false;
110 
111   Info.True = MI.getOperand(2).getReg();
112   Info.False = MI.getOperand(3).getReg();
113 
114   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
115       !(Info.LHS == Info.False && Info.RHS == Info.True))
116     return false;
117 
118   switch (Info.Pred) {
119   case CmpInst::FCMP_FALSE:
120   case CmpInst::FCMP_OEQ:
121   case CmpInst::FCMP_ONE:
122   case CmpInst::FCMP_ORD:
123   case CmpInst::FCMP_UNO:
124   case CmpInst::FCMP_UEQ:
125   case CmpInst::FCMP_UNE:
126   case CmpInst::FCMP_TRUE:
127     return false;
128   default:
129     return true;
130   }
131 }
132 
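// A sketch of the rewrite for one predicate (register names are illustrative,
// not taken from a real test):
//   %c:_(s1) = G_FCMP floatpred(olt), %x:_(s32), %y:_(s32)
//   %d:_(s32) = G_SELECT %c:_(s1), %x:_(s32), %y:_(s32)
// becomes
//   %d:_(s32) = G_AMDGPU_FMIN_LEGACY %x:_(s32), %y:_(s32)
// The other predicates below differ only in the opcode and operand order.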
133 void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
134     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
135   B.setInstrAndDebugLoc(MI);
136   auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
137     B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
138   };
139 
140   switch (Info.Pred) {
141   case CmpInst::FCMP_ULT:
142   case CmpInst::FCMP_ULE:
143     if (Info.LHS == Info.True)
144       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
145     else
146       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
147     break;
148   case CmpInst::FCMP_OLE:
149   case CmpInst::FCMP_OLT: {
150     // We need to permute the operands to get the correct NaN behavior. The
151     // selected operand is the second one based on the failing compare with NaN,
152     // so permute it based on the compare type the hardware uses.
153     if (Info.LHS == Info.True)
154       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
155     else
156       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
157     break;
158   }
159   case CmpInst::FCMP_UGE:
160   case CmpInst::FCMP_UGT: {
161     if (Info.LHS == Info.True)
162       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
163     else
164       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
165     break;
166   }
167   case CmpInst::FCMP_OGT:
168   case CmpInst::FCMP_OGE: {
169     if (Info.LHS == Info.True)
170       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
171     else
172       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
173     break;
174   }
175   default:
176     llvm_unreachable("predicate should not have matched");
177   }
178 
179   MI.eraseFromParent();
180 }
181 
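// Match an integer-to-float conversion whose source has all bits above the
// low byte known to be zero, so it can use the hardware's unsigned
// byte-to-float conversion. The generated rules are expected to route
// G_UITOFP/G_SITOFP here.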
182 bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
183   Register DstReg = MI.getOperand(0).getReg();
184 
185   // TODO: We could try to match extracting the higher bytes, which would be
186   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
187   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
188   // about in practice.
189   LLT Ty = MRI.getType(DstReg);
190   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
191     Register SrcReg = MI.getOperand(1).getReg();
192     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
193     assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
194     const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
195     return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
196   }
197 
198   return false;
199 }
200 
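// Rewrite the matched conversion as G_AMDGPU_CVT_F32_UBYTE0. There is no
// byte-to-f16 conversion instruction, so the s16 case converts to f32 first
// and truncates the result.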
201 void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
202   B.setInstrAndDebugLoc(MI);
203 
204   const LLT S32 = LLT::scalar(32);
205 
206   Register DstReg = MI.getOperand(0).getReg();
207   Register SrcReg = MI.getOperand(1).getReg();
208   LLT Ty = MRI.getType(DstReg);
209   LLT SrcTy = MRI.getType(SrcReg);
210   if (SrcTy != S32)
211     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
212 
213   if (Ty == S32) {
214     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
215                  MI.getFlags());
216   } else {
217     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
218                              MI.getFlags());
219     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
220   }
221 
222   MI.eraseFromParent();
223 }
224 
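// Fold a constant shift of the conversion source into the byte index of the
// G_AMDGPU_CVT_F32_UBYTEn opcode. An illustrative example (register names
// made up):
//   %c:_(s32) = G_CONSTANT i32 16
//   %s:_(s32) = G_LSHR %x:_(s32), %c:_(s32)
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s:_(s32)
// becomes
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x:_(s32)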
225 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
226     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
227   Register SrcReg = MI.getOperand(1).getReg();
228 
229   // Look through G_ZEXT.
230   mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
231 
232   Register Src0;
233   int64_t ShiftAmt;
234   bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
235   if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
236     const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
237 
238     unsigned ShiftOffset = 8 * Offset;
239     if (IsShr)
240       ShiftOffset += ShiftAmt;
241     else
242       ShiftOffset -= ShiftAmt;
243 
244     MatchInfo.CvtVal = Src0;
245     MatchInfo.ShiftOffset = ShiftOffset;
246     return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
247   }
248 
249   // TODO: Simplify demanded bits.
250   return false;
251 }
252 
253 void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
254     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
255   B.setInstrAndDebugLoc(MI);
256   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
257 
258   const LLT S32 = LLT::scalar(32);
259   Register CvtSrc = MatchInfo.CvtVal;
260   LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
261   if (SrcTy != S32) {
262     assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
263     CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
264   }
265 
266   assert(MI.getOpcode() != NewOpc);
267   B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
268   MI.eraseFromParent();
269 }
270 
271 bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
272     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
273     ClampI64ToI16MatchInfo &MatchInfo) {
274   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
275   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
276 
  // We want to check whether a 64-bit value gets clamped to 16-bit boundaries
  // (or narrower).
279   if (SrcType != LLT::scalar(64))
280     return false;
281 
  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");
285 
286   if (mi_match(MI.getOperand(1).getReg(), MRI,
287                m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
288                         m_Reg(MatchInfo.Origin)))) {
289     const auto Cmp1 = MatchInfo.Cmp1;
290     const auto Cmp2 = MatchInfo.Cmp2;
291     const auto Diff = std::abs(Cmp2 - Cmp1);
292 
    // No clamping is needed if the boundaries are equal or adjacent.
294     if (Diff == 0 || Diff == 1) {
295       return false;
296     }
297 
298     const int64_t Min = std::numeric_limits<int16_t>::min();
299     const int64_t Max = std::numeric_limits<int16_t>::max();
300 
    // Check that we are really clamping against the signed 16-bit range.
302     return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
303             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
304   }
305 
306   return false;
307 }
308 
309 /**
310  * We want to find a combination of instructions that
311  * gets generated when an i64 gets clamped to i16.
312  * The corresponding pattern is:
313  * G_SELECT MIN/MAX, G_ICMP, G_SELECT MIN/MAX, G_ICMP, G_TRUNC.
314  * This can be efficiently written as following:
315  * v_cvt_pk_i16_i32 v0, v0, v1
316  * v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
317  */
318 void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
319     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");
321 
322   MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
324 
325   Register Src = MatchInfo.Origin;
326   assert(MRI.getType(Src) == LLT::scalar(64));
327   const LLT S32 = LLT::scalar(32);
328 
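  // Split the 64-bit source into its 32-bit halves, pack them into saturated
  // 16-bit values with v_cvt_pk_i16_i32, and clamp the packed result into
  // [Min, Max] with v_med3_i32, as described in the comment above.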
329   auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge->getOperand(0).getReg();
  Register Hi32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
334 
335   constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
336   assert(MI.getOpcode() != CvtOpcode);
337 
338   const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;
339 
340   Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
341   MRI.setType(CvtDst, S32);
342 
343   auto CvtPk = B.buildInstr(CvtOpcode);
344   CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
347   CvtPk.setMIFlags(MI.getFlags());
348 
  const int64_t Min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t Max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MinBoundaryDst, S32);
  B.buildConstant(MinBoundaryDst, Min);

  Register MaxBoundaryDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MaxBoundaryDst, S32);
  B.buildConstant(MaxBoundaryDst, Max);
359 
360   Register MedDst = MRI.createVirtualRegister(REG_CLASS);
361   MRI.setType(MedDst, S32);
362 
363   auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
364   Med.addDef(MedDst);
365   Med.addReg(MinBoundaryDst);
366   Med.addReg(CvtDst);
367   Med.addReg(MaxBoundaryDst);
368   Med.setMIFlags(MI.getFlags());
369 
370   B.buildCopy(MI.getOperand(0).getReg(), MedDst);
371 
372   MI.eraseFromParent();
373 }
374 
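// The TableGen-generated combiner included below derives from this state
// class, which is how the generated rules reach the CombinerHelper and the
// AMDGPU-specific helper defined above.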
375 class AMDGPUPostLegalizerCombinerHelperState {
376 protected:
377   CombinerHelper &Helper;
378   AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
379 
380 public:
381   AMDGPUPostLegalizerCombinerHelperState(
382       CombinerHelper &Helper,
383       AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
384       : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
385 };
386 
387 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
388 #include "AMDGPUGenPostLegalizeGICombiner.inc"
389 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
390 
391 namespace {
392 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
393 #include "AMDGPUGenPostLegalizeGICombiner.inc"
394 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
395 
396 class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
397   GISelKnownBits *KB;
398   MachineDominatorTree *MDT;
399 
400 public:
401   AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
402 
403   AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
404                                   const AMDGPULegalizerInfo *LI,
405                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
406       : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
407                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
408         KB(KB), MDT(MDT) {
409     if (!GeneratedRuleCfg.parseCommandLineOption())
410       report_fatal_error("Invalid rule identifier");
411   }
412 
413   bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
414                MachineIRBuilder &B) const override;
415 };
416 
417 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
418                                               MachineInstr &MI,
419                                               MachineIRBuilder &B) const {
420   CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
421   AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
422   AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
423                                                  PostLegalizerHelper);
424 
425   if (Generated.tryCombineAll(Observer, MI, B))
426     return true;
427 
428   switch (MI.getOpcode()) {
429   case TargetOpcode::G_SHL:
430   case TargetOpcode::G_LSHR:
431   case TargetOpcode::G_ASHR:
432     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
433     // common case, splitting this into a move and a 32-bit shift is faster and
434     // the same code size.
435     return Helper.tryCombineShiftToUnmerge(MI, 32);
436   }
437 
438   return false;
439 }
440 
441 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
442 #include "AMDGPUGenPostLegalizeGICombiner.inc"
443 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
444 
445 // Pass boilerplate
446 // ================
447 
448 class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
449 public:
450   static char ID;
451 
452   AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
453 
454   StringRef getPassName() const override {
455     return "AMDGPUPostLegalizerCombiner";
456   }
457 
458   bool runOnMachineFunction(MachineFunction &MF) override;
459 
460   void getAnalysisUsage(AnalysisUsage &AU) const override;
461 
462 private:
463   bool IsOptNone;
464 };
465 } // end anonymous namespace
466 
467 void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
468   AU.addRequired<TargetPassConfig>();
469   AU.setPreservesCFG();
470   getSelectionDAGFallbackAnalysisUsage(AU);
471   AU.addRequired<GISelKnownBitsAnalysis>();
472   AU.addPreserved<GISelKnownBitsAnalysis>();
473   if (!IsOptNone) {
474     AU.addRequired<MachineDominatorTree>();
475     AU.addPreserved<MachineDominatorTree>();
476   }
477   MachineFunctionPass::getAnalysisUsage(AU);
478 }
479 
480 AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
481     : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
482   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
483 }
484 
485 bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
486   if (MF.getProperties().hasProperty(
487           MachineFunctionProperties::Property::FailedISel))
488     return false;
489   auto *TPC = &getAnalysis<TargetPassConfig>();
490   const Function &F = MF.getFunction();
491   bool EnableOpt =
492       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
493 
494   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
495   const AMDGPULegalizerInfo *LI =
496       static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
497 
498   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
499   MachineDominatorTree *MDT =
500       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
501   AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
502                                          F.hasMinSize(), LI, KB, MDT);
503   Combiner C(PCInfo, TPC);
504   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
505 }
506 
507 char AMDGPUPostLegalizerCombiner::ID = 0;
508 INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
509                       "Combine AMDGPU machine instrs after legalization", false,
510                       false)
511 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
512 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
513 INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
514                     "Combine AMDGPU machine instrs after legalization", false,
515                     false)
516 
517 namespace llvm {
518 FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
519   return new AMDGPUPostLegalizerCombiner(IsOptNone);
520 }
521 } // end namespace llvm
522