//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"

#include <limits>

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value is clamped to the i16 range by a
  // min/max pair before being truncated.
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_MaxMin(m_ICst(MatchInfo.Cmp1), m_ICst(MatchInfo.Cmp2),
                        m_Reg(MatchInfo.Origin)))) {
    const int64_t Cmp1 = MatchInfo.Cmp1;
    const int64_t Cmp2 = MatchInfo.Cmp2;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Are we really trying to clamp against short boundaries?
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  LLVM_DEBUG(dbgs() << "Combining MI\n");

  B.setInstrAndDebugLoc(MI);

  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // Split the s64 source; the first unmerge result is the low half.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge->getOperand(0).getReg();
  Register Hi32 = Unmerge->getOperand(1).getReg();
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  Register CvtDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(CvtDst, &AMDGPU::VGPR_32RegClass);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  auto Min = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto Max = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  Register MinBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MinBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MinBoundaryDst, Min);

  Register MaxBoundaryDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MaxBoundaryDst, &AMDGPU::VGPR_32RegClass);
  B.buildConstant(MaxBoundaryDst, Max);

  Register MedDst = MRI.createGenericVirtualRegister(S32);
  MRI.setRegClass(MedDst, &AMDGPU::VGPR_32RegClass);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst);
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst);
  Med.setMIFlags(MI.getFlags());

  // The original G_TRUNC result is s16, so truncate the s32 med3 result rather
  // than copying between mismatched sizes.
  B.buildTrunc(MI.getOperand(0).getReg(), MedDst);

  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm