//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
#include <limits>

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
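  // In summary, the checks below require: an s32 select result, a condition
  // that is a G_FCMP with no other non-debug users, select operands that are
  // exactly the compare operands (in either order), and an ordering predicate
  // (lt/le/gt/ge, ordered or unordered).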
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
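  // The combine only fires when known-bits analysis proves that everything
  // above the low 8 bits of the source is already zero, i.e. the conversion
  // source is really an unsigned byte.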
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());

  // Check whether a 64-bit value is being clamped to 16-bit boundaries (or
  // narrower).
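  // Illustrative sketch of the value being truncated (the exact MIR shape
  // depends on what m_MaxMin accepts, e.g. G_SMIN/G_SMAX or the equivalent
  // G_ICMP + G_SELECT pairs; names and constants here are made up):
  //   %clamped = smax(smin(%x, 32767), -32768)
  //   %res:_(s16) = G_TRUNC %clamped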
  if (SrcType != LLT::scalar(64))
    return false;

  MachineIRBuilder B(MI);

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const auto Cmp1 = MatchInfo.Cmp1;
    const auto Cmp2 = MatchInfo.Cmp2;
    const auto Diff = std::abs(Cmp2 - Cmp1);

    // We don't need to clamp here.
    if (Diff == 0 || Diff == 1)
      return false;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really trying to clamp against short boundaries?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}

// We want to find the combination of instructions that is generated when an
// i64 gets clamped to i16. The corresponding pattern is:
//   G_SELECT MIN/MAX, G_ICMP, G_SELECT MIN/MAX, G_ICMP, G_TRUNC.
// This can be written more efficiently as:
//   v_cvt_pk_i16_i32 v0, v0, v1
//   v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  MachineIRBuilder B(MI);
  MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Hi32 = Unmerge->getOperand(0).getReg();
  Register Lo32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  const auto *RegClass = &AMDGPU::VGPR_32RegClass;

  Register CvtDst = MRI.createVirtualRegister(RegClass);
  MRI.setType(CvtDst, S32);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Hi32);
  CvtPk.addReg(Lo32);
  CvtPk.setMIFlags(MI.getFlags());

  const int64_t Min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  const int64_t Max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createVirtualRegister(RegClass);
  MRI.setType(MinBoundaryDst, S32);
  B.buildConstant(MinBoundaryDst, Min);

  Register MaxBoundaryDst = MRI.createVirtualRegister(RegClass);
  MRI.setType(MaxBoundaryDst, S32);
  B.buildConstant(MaxBoundaryDst, Max);

  Register MedDst = MRI.createVirtualRegister(RegClass);
  MRI.setType(MedDst, S32);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  B.buildCopy(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};
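
// The combiner rule table generated by TableGen is pulled in three times
// below: once for its dependencies, once for the class declaration (inside the
// anonymous namespace), and once for the member definitions.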

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
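
// Usage sketch (illustrative; the factory above is the only interface this
// file exports): a target's pass configuration is expected to schedule this
// combiner after the Legalizer, e.g.
//   addPass(createAMDGPUPostLegalizeCombiner(/*IsOptNone=*/false));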