//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
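  // Only handle f32 and f16 results, and only when known bits prove that
  // everything above the low byte of the source is already zero.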
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
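    // tryCombineShiftToUnmerge leaves the instruction alone (returning false)
    // unless the shift amount is a constant large enough for the narrowing to
    // be valid, so no extra guard is needed here.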
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm