//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {
  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
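    // Illustrative note (DX9-style legacy semantics assumed, not restated in
    // this file): fmin_legacy(a, b) behaves like (a < b) ? a : b, so when
    // either input is NaN the compare fails and the second operand is
    // returned. Swapping the operands below keeps the value the original
    // select would have chosen in the NaN case.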
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    const APInt Mask = APInt::getHighBitsSet(32, 24);
    return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
                                                    Mask);
  }

  return false;
}

static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {MI.getOperand(1)}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {MI.getOperand(1)}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};

static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelper Generated;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!Generated.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster
    // and the same code size.
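    // Illustrative sketch (assumes a constant shift amount of at least 32 on
    // a 64-bit value; the exact conditions live in CombinerHelper): a rewrite
    // like (G_LSHR %x:s64, 40) can become an unmerge of %x into 32-bit
    // halves, a 32-bit shift of the high half by 8, and a zero high result.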
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm