//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

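// Replace the matched select/fcmp pair with a single legacy min/max
// instruction. As an illustrative sketch, assuming DX9-style legacy
// semantics where fmin_legacy(x, y) == (x < y) ? x : y, i.e. a failed
// compare (including any NaN input) selects the second operand:
//
//   select(olt(x, y), x, y) --> fmin_legacy(x, y)  ; both yield y on NaN
//   select(ult(x, y), x, y) --> fmin_legacy(y, x)  ; both yield x on NaN
//
// which is why the ordered and unordered cases below permute the operands
// differently.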
static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {
  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

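// Match an integer-to-float conversion whose 32-bit source is known to have
// its high 24 bits clear, so it can be lowered to the hardware byte-convert
// instruction. Illustrative MIR sketch (assuming the generated rule fires on
// a G_UITOFP root):
//
//   %f:_(s32) = G_UITOFP %x:_(s32)   ; high 24 bits of %x known zero
//     -->
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %x:_(s32)
//
// The s16 destination case converts to f32 first and adds a G_FPTRUNC, as in
// applyUCharToFloat below.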
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    const APInt Mask = APInt::getHighBitsSet(32, 24);
    return Helper.getKnownBits()->maskedValueIsZero(MI.getOperand(1).getReg(),
                                                    Mask);
  }

  return false;
}

static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {MI.getOperand(1)}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {MI.getOperand(1)}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelper Generated;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!Generated.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm