//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetMachine.h"
#include <cstdlib>
#include <limits>

#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPreLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1;
    int64_t Cmp2;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo);

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo);
};

bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value should get clamped to short.
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  if (DstType != LLT::scalar(16))
    return false;

  LLVM_DEBUG(dbgs() << "Matching Clamp i64 to i16\n");

  Register Base;

  // Try to match a combination of min / max MIR opcodes. Accept either
  // smin(smax(x, Cmp2), Cmp1) or smax(smin(x, Cmp2), Cmp1); bail out if
  // neither form is present, since MatchInfo would otherwise be left
  // uninitialized.
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (!mi_match(Base, MRI,
                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2))))
      return false;
  } else if (mi_match(MI.getOperand(1).getReg(), MRI,
                      m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (!mi_match(Base, MRI,
                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2))))
      return false;
  } else {
    return false;
  }

  const auto Cmp1 = MatchInfo.Cmp1;
  const auto Cmp2 = MatchInfo.Cmp2;
  const auto Diff = std::abs(Cmp2 - Cmp1);

  // If the difference between both comparison values is 0 or 1, there is no
  // need to clamp.
  if (Diff == 0 || Diff == 1)
    return false;

  const int64_t Min = std::numeric_limits<int16_t>::min();
  const int64_t Max = std::numeric_limits<int16_t>::max();

  // Check that both comparison values lie within the i16 range
  // [SHORT_MIN, SHORT_MAX].
  return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
          (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
}
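
// Illustrative example of the gMIR shape matchClampI64ToI16 looks for (a
// sketch; the virtual register names and the ordering of the constants are
// assumptions, not taken from an actual test case):
//
//   %lo:_(s64)  = G_CONSTANT i64 -32768
//   %hi:_(s64)  = G_CONSTANT i64 32767
//   %max:_(s64) = G_SMAX %x, %lo
//   %min:_(s64) = G_SMIN %max, %hi
//   %res:_(s16) = G_TRUNC %min
//
// Here MatchInfo.Origin binds to %x, Cmp1 to 32767 and Cmp2 to -32768, and
// the range check above verifies that both constants fit into an i16.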

// We want to find the combination of instructions that gets generated when an
// i64 is clamped to i16.
// The corresponding pattern is:
// G_SMIN / G_SMAX on i64 feeding a G_TRUNC to i16.
// This can be lowered efficiently as:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
  Register Src = MatchInfo.Origin;
  assert(MRI.getType(Src) == LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  B.setInstrAndDebugLoc(MI);

  // Split the i64 source into its 32-bit halves; the first unmerge result is
  // the low half.
  auto Unmerge = B.buildUnmerge(S32, Src);
  Register Lo32 = Unmerge.getReg(0);
  Register Hi32 = Unmerge.getReg(1);
  MRI.setRegClass(Lo32, &AMDGPU::VGPR_32RegClass);
  MRI.setRegClass(Hi32, &AMDGPU::VGPR_32RegClass);

  constexpr unsigned int CvtOpcode = AMDGPU::V_CVT_PK_I16_I32_e64;
  assert(MI.getOpcode() != CvtOpcode);

  const auto REG_CLASS = &AMDGPU::VGPR_32RegClass;

  Register CvtDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(CvtDst, S32);

  auto CvtPk = B.buildInstr(CvtOpcode);
  CvtPk.addDef(CvtDst);
  CvtPk.addReg(Lo32);
  CvtPk.addReg(Hi32);
  CvtPk.setMIFlags(MI.getFlags());

  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);

  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  MRI.setRegClass(MinBoundaryDst.getReg(0), REG_CLASS);

  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
  MRI.setRegClass(MaxBoundaryDst.getReg(0), REG_CLASS);

  Register MedDst = MRI.createVirtualRegister(REG_CLASS);
  MRI.setType(MedDst, S32);

  auto Med = B.buildInstr(AMDGPU::V_MED3_I32);
  Med.addDef(MedDst);
  Med.addReg(MinBoundaryDst.getReg(0));
  Med.addReg(CvtDst);
  Med.addReg(MaxBoundaryDst.getReg(0));
  Med.setMIFlags(MI.getFlags());

  Register TruncDst = MRI.createGenericVirtualRegister(LLT::scalar(16));
  B.buildTrunc(TruncDst, MedDst);
  B.buildCopy(MI.getOperand(0).getReg(), TruncDst);

  MI.eraseFromParent();
}
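
// Illustrative sketch of what applyClampI64ToI16 emits for a match with
// bounds [-32768, 32767] (register names are assumptions, not output from an
// actual run):
//
//   %lo:vgpr_32(s32), %hi:vgpr_32(s32) = G_UNMERGE_VALUES %origin:_(s64)
//   %pk:vgpr_32(s32)  = V_CVT_PK_I16_I32_e64 %lo, %hi
//   %min:vgpr_32(s32) = G_CONSTANT i32 -32768
//   %max:vgpr_32(s32) = G_CONSTANT i32 32767
//   %med:vgpr_32(s32) = V_MED3_I32 %min, %pk, %max
//   %t:_(s16) = G_TRUNC %med
//   %dst:_(s16) = COPY %t
//
// v_cvt_pk_i16_i32 saturates each 32-bit half to the i16 range while packing,
// and v_med3_i32 then clamps the packed value to the matched bounds, so the
// final truncate yields the clamped result.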

class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};

#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                 const AMDGPULegalizerInfo *LI,
                                 GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
                       MachineIRBuilder &B) const override;
};

bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                             MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
  AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                PreLegalizerHelper);

  // The generated helper already holds references to Helper and
  // PreLegalizerHelper through the state class, so they are not passed again.
  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  }

  return false;
}

#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                        F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
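
// How this pass is expected to be added to the GlobalISel pipeline (a sketch
// assuming the usual GCNPassConfig hook in AMDGPUTargetMachine.cpp; it is not
// part of this file):
//
//   void GCNPassConfig::addPreLegalizeMachineIR() {
//     bool IsOptNone = getOptLevel() == CodeGenOpt::None;
//     addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
//   }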