//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

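// Match/apply routines for the AMDGPU-specific combines run after the
// legalizer; they are invoked from the tablegen-generated rules included
// further below.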
class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                    AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  bool matchRcpSqrtToRsq(MachineInstr &MI,
                         std::function<void(MachineIRBuilder &)> &MatchInfo);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

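// Match a 32-bit G_SELECT whose condition is a single-use G_FCMP comparing the
// same two values that are being selected. On subtargets with fmin_legacy and
// fmax_legacy, such a select can be lowered to one of those instructions for
// the ordering predicates; the equality/ordered-ness predicates are rejected.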
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

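// Replace the matched select with G_AMDGPU_FMIN_LEGACY or
// G_AMDGPU_FMAX_LEGACY, ordering the operands so that the legacy
// instructions' NaN behavior matches the original compare-and-select.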
void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

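// An integer-to-float conversion producing f32 or f16 can instead use the
// byte-extracting G_AMDGPU_CVT_F32_UBYTE0 when every bit of the source above
// the low 8 is known to be zero.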
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

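// Emit G_AMDGPU_CVT_F32_UBYTE0 on the (any-extended) source, adding an
// FPTRUNC when the original result type was not 32 bits.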
void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

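// Fold amdgcn.rcp(G_FSQRT x) and G_FSQRT(amdgcn.rcp x) into a single
// amdgcn.rsq intrinsic. The rewrite is captured as a deferred builder
// callback in MatchInfo so it can be applied later.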
bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {

  auto getRcpSrc = [=](const MachineInstr &MI) {
    MachineInstr *ResMI = nullptr;
    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
      ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());

    return ResMI;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) {
    MachineInstr *SqrtSrcMI = nullptr;
    mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  return false;
}

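// If the source of a G_AMDGPU_CVT_F32_UBYTEn is a constant left or right
// shift (possibly seen through a G_ZEXT), fold the shift into the byte index,
// provided the adjusted index still selects a whole byte within 32 bits.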
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

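// Rebuild the conversion with the byte index adjusted by the folded shift,
// any-extending the new source to 32 bits if needed.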
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

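// A G_FCANONICALIZE of a value the target already considers canonical is a
// no-op; return the source register so it can be used directly in place of
// the canonicalize's result.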
bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

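// State shared with the tablegen-generated combiner; the generated rules
// refer to these members by the names Helper and PostLegalizerHelper.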
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  AMDGPUCombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      AMDGPUCombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

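// Try the tablegen-generated rules first, then fall back to the manual
// opcode-based combines below.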
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm