//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

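// Illustrative sketch of the pattern matchFMinFMaxLegacy looks for (assumed
// MIR; register names are invented for exposition, not taken from a test):
//   %c:_(s1) = G_FCMP floatpred(olt), %a:_(s32), %b:_(s32)
//   %sel:_(s32) = G_SELECT %c:_(s1), %a:_(s32), %b:_(s32)
// which the apply step folds into a single
//   %sel:_(s32) = G_AMDGPU_FMIN_LEGACY %a:_(s32), %b:_(s32)
// with the operand order chosen to preserve the legacy (non-IEEE) NaN
// behavior of the select.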
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

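// Illustrative sketch of the uchar-to-float rewrite (assumed MIR; names are
// invented for exposition):
//   %mask:_(s32) = G_CONSTANT i32 255
//   %b:_(s32) = G_AND %v:_(s32), %mask
//   %f:_(s32) = G_UITOFP %b:_(s32)
// Known-bits analysis proves the upper 24 bits of %b are zero, so this can be
// rewritten as
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %b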
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

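// Illustrative example of the byte-offset folding handled by the pair above
// (assumed MIR; names are invented for exposition):
//   %c:_(s32) = G_CONSTANT i32 16
//   %s:_(s32) = G_LSHR %x:_(s32), %c
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
// becomes
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x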
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const int64_t Cmp1 = MatchInfo.Cmp1;
    const int64_t Cmp2 = MatchInfo.Cmp2;

    constexpr int64_t Min = std::numeric_limits<int16_t>::min();
    constexpr int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really trying to clamp against the i16 boundaries?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  B.setInstrAndDebugLoc(MI);

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge->getOperand(0).getReg();
  Register Hi32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  Register CvtDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(CvtDst, &AMDGPU::VGPR_32RegClass);

  // V_CVT_PK_I16_I32 saturates each 32-bit source to the i16 range and packs
  // the two results into one 32-bit register.
  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  const int64_t Min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t Max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MinBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, Min);

  Register MaxBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MaxBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, Max);

  Register MedDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MedDst, &AMDGPU::VGPR_32RegClass);

  // V_MED3_I32 returns the median of its three operands, which clamps the
  // packed value to [Min, Max].
  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

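// Illustrative sketch of the sequence applyClampI64ToI16 above emits (assumed
// MIR; register names are invented for exposition). A smin/smax pair that
// clamps an s64 value to [-32768, 32767] before a G_TRUNC to s16 becomes
// roughly:
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x:_(s64)
//   %pk:_(s32) = V_CVT_PK_I16_I32_e64 %lo, %hi
//   %min:_(s32) = G_CONSTANT i32 -32768
//   %max:_(s32) = G_CONSTANT i32 32767
//   %med:_(s32) = V_MED3_I32 %min, %pk, %max
// with %med copied into the original G_TRUNC destination.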
class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster
    // and the same code size.
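    // Illustrative example (assumed MIR; not taken from a specific test):
    //   %c:_(s32) = G_CONSTANT i32 33
    //   %r:_(s64) = G_LSHR %x:_(s64), %c
    // only needs a 32-bit shift of the high half of %x by one bit; the high
    // half of %r is known to be zero.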
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
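// Illustrative usage sketch (assumed wiring, not part of this file): the
// AMDGPU pass pipeline is expected to schedule this combiner after the
// legalizer, roughly as:
//   bool IsOptNone = getOptLevel() == CodeGenOpt::None;
//   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));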