//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

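// The fmin/fmax legacy combine below looks for a compare feeding a select of
// the same two operands. As a rough sketch (register names are illustrative
// only, not taken from any particular test):
//
//   %cmp:_(s1) = G_FCMP floatpred(olt), %a:_(s32), %b:_(s32)
//   %sel:_(s32) = G_SELECT %cmp:_(s1), %a:_(s32), %b:_(s32)
//
// can be turned into G_AMDGPU_FMIN_LEGACY %a, %b, with the operand order
// chosen in the apply function to preserve the NaN behavior of the select.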
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

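// The uchar-to-float combine below fires on an integer-to-float conversion
// whose source is known to have only its low 8 bits set. A rough sketch of
// the intent (register names are illustrative only):
//
//   %f:_(s32) = G_UITOFP %x:_(s32)   ; high 24 bits of %x known to be zero
//
// becomes G_AMDGPU_CVT_F32_UBYTE0 %x, which converts just the low byte.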
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

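// When the match above succeeds, a constant shift feeding a byte-convert can
// be folded into the byte index of the convert itself. A rough sketch
// (register names are illustrative only):
//
//   %c:_(s32) = G_CONSTANT i32 8
//   %s:_(s32) = G_LSHR %x:_(s32), %c:_(s32)
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s:_(s32)
//
// is equivalent to G_AMDGPU_CVT_F32_UBYTE1 %x, so the shift can be dropped.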
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
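    //
    // As a sketch (illustrative only): a G_LSHR of an s64 value by a constant
    // of at least 32 can become a 32-bit shift of the high half taken from
    // G_UNMERGE_VALUES, with zero merged back in for the upper word.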
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm