//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
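  // The legacy fmin/fmax operations only exist on subtargets that still
  // provide them, hence the explicit check below.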
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
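  // The conversion only reads the low byte of the source, so the combine is
  // only safe when known-bits analysis proves every higher bit is zero.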
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());

  // We want to check whether a 64-bit value gets clamped to 16-bit boundaries
  // (or below).
  if (SrcType != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const auto Cmp1 = MatchInfo.Cmp1;
    const auto Cmp2 = MatchInfo.Cmp2;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really trying to clamp against short boundaries?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}

/**
 * We want to find a combination of instructions that
 * gets generated when an i64 gets clamped to i16.
 * The corresponding pattern is:
 * G_SELECT MIN/MAX, G_ICMP, G_SELECT MIN/MAX, G_ICMP, G_TRUNC.
 * This can be efficiently written as the following:
 * v_cvt_pk_i16_i32 v0, v0, v1
 * v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
 */
void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Hi32 = Unmerge->getOperand(0).getReg();
  Register Lo32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  Register CvtDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Hi32);
  CvtPk.addReg(Lo32);
  CvtPk.setMIFlags(MI.getFlags());

  auto min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, min);

  Register MaxBoundaryDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, max);

  Register MedDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
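// CombinerInfo wrapper that runs the TableGen-generated rule set first and
// then a small number of manually implemented combines (see combine() below).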
class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm