//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <limits>

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

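  // Cmp1/Cmp2 are the two clamp bounds that were matched as constants; Origin
  // is the 64-bit value being clamped to the i16 range.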
  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

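// Match a G_SELECT whose condition is a single-use G_FCMP comparing the same
// two values the select chooses between, i.e. (roughly):
//
//   %c:_(s1) = G_FCMP floatpred(olt), %x:_(s32), %y:_(s32)
//   %r:_(s32) = G_SELECT %c, %x, %y
//
// Predicates with no corresponding legacy min/max (equality, ordered/unordered
// checks, always-true/false) are rejected.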
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

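  // For example (sketch), with the operands in the matched order:
  //   select (fcmp olt %x, %y), %x, %y --> fmin_legacy %x, %y
  //   select (fcmp ogt %x, %y), %x, %y --> fmax_legacy %x, %y
  // The unordered and swapped-operand forms commute the sources so the value
  // produced for a NaN input matches the legacy hardware semantics.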
  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

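// Match an integer-to-FP conversion (G_UITOFP/G_SITOFP in the generated rule
// set) whose source is already known to fit in the low 8 bits, so it can use
// the hardware byte-0 conversion, e.g. (sketch):
//
//   %b:_(s32) = G_AND %x, 255
//   %f:_(s32) = G_UITOFP %b  --> %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %b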
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

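// Match a G_AMDGPU_CVT_F32_UBYTEn whose source is a constant left/right shift
// (possibly seen through a G_ZEXT) and fold the shift amount into the byte
// index, e.g. (sketch):
//
//   %s:_(s32) = G_LSHR %x, 16
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
//     --> %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x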
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

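// Match a G_TRUNC to s16 of a 64-bit value that has been clamped to the
// signed 16-bit range with a min/max pair against constants, roughly:
//
//   %lo:_(s64) = G_CONSTANT i64 -32768
//   %hi:_(s64) = G_CONSTANT i64 32767
//   %a:_(s64)  = G_SMAX %x, %lo
//   %b:_(s64)  = G_SMIN %a, %hi
//   %r:_(s16)  = G_TRUNC %b
//
// The exact min/max ordering is handled by m_MaxMin; the two constants only
// need to lie within the i16 range.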
bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const int64_t Cmp1 = MatchInfo.Cmp1;
    const int64_t Cmp2 = MatchInfo.Cmp2;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really trying to clamp against short boundaries?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}

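// Lower the matched clamp by splitting the 64-bit source into two 32-bit
// halves, packing them with V_CVT_PK_I16_I32 (which saturates each 32-bit
// source to the i16 range), and then applying the matched boundaries with
// V_MED3_I32.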
void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  B.setInstrAndDebugLoc(MI);

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge.getReg(0);
  Register Hi32 = Unmerge.getReg(1);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  Register CvtDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(CvtDst, &AMDGPU::VGPR_32RegClass);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  const int64_t MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MinBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, MinBoundary);

  Register MaxBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MaxBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, MaxBoundary);

  Register MedDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MedDst, &AMDGPU::VGPR_32RegClass);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
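    // E.g. (sketch), %r:_(s64) = G_SHL %x:_(s64), 40 only needs the low half
    // of %x shifted left by 8 into the high half of the result, with a zero
    // low half, so it can be rebuilt from 32-bit pieces.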
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm