//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
  /// binary operator \p V.
  ///
  /// \returns Binary operator \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns Equivalent 16 bit integer type for given 32 bit integer type
  /// \p T.
  Type *getI16Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given 16 bit integer type
  /// \p T.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if the base element of type \p T is a 16 bit integer, false
  /// otherwise.
  bool isI16Ty(const Type *T) const;

  /// \returns True if the base element of type \p T is a 32 bit integer, false
  /// otherwise.
  bool isI32Ty(const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \brief Promotes uniform 16 bit binary operation \p I to the equivalent
  /// 32 bit binary operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the equivalent 32 bit operation, and
  /// truncating the result of the 32 bit operation back to 16 bits. 16 bit
  /// division operations are not promoted.
  ///
  /// \returns True if the 16 bit binary operation is promoted to the
  /// equivalent 32 bit binary operation, false otherwise.
  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;

  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to the 32 bit 'icmp'
  /// operation by sign or zero extending operands to 32 bits, and replacing
  /// the 16 bit operation with the 32 bit operation.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;

  /// \brief Promotes uniform 16 bit 'select' operation \p I to the 32 bit
  /// 'select' operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the 32 bit operation, and truncating
  /// the result of the 32 bit operation back to 16 bits.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(SelectInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU IR optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  assert(isa<BinaryOperator>(V) && "V must be binary operator");

  BinaryOperator *BinOp = cast<BinaryOperator>(V);
  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp)) {
    BinOp->setIsExact(I.isExact());
  }

  return V;
}

Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI32Ty(T) && "T must be 32 bits");

  if (T->isIntegerTy())
    return B.getInt16Ty();
  return VectorType::get(B.getInt16Ty(),
                         cast<VectorType>(T)->getNumElements());
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI16Ty(T) && "T must be 16 bits");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(),
                         cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
  if (T->isIntegerTy(16))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
}

bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
  if (T->isIntegerTy(32))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  // 'ashr' must also be treated as signed: promoting its operands with zext
  // would turn it into a logical shift on the low 16 bits.
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}
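
// For illustration, a uniform unsigned case (the IR value names below are
// hypothetical):
//
//   %r = add i16 %a, %b
//
// is rewritten as:
//
//   %ext0 = zext i16 %a to i32
//   %ext1 = zext i16 %b to i32
//   %extr = add i32 %ext0, %ext1
//   %r    = trunc i32 %extr to i16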
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
  assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
  assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
  Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
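
// For illustration (hypothetical value names), a uniform unsigned case:
//
//   %r = select i1 %cc, i16 %a, i16 %b
//
// becomes:
//
//   %ext1 = zext i16 %a to i32
//   %ext2 = zext i16 %b to i32
//   %extr = select i1 %cc, i32 %ext1, i32 %ext2
//   %r    = trunc i32 %extr to i16
//
// Only the two value operands are extended; the i1 condition is unchanged.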
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
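//
// The 2.5 ULP !fpmath threshold checked below matches the accuracy the
// amdgcn.fdiv.fast lowering is expected to provide, and the FP32 denormal
// bail-out assumes the fast expansion is not denormal safe.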
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Only report a change if the fdiv was actually rewritten.
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}
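
// The visitors below widen only uniform i16 operations. The motivation (an
// assumption here, not spelled out in this file) is that uniform values are
// selected onto the scalar unit, which has no 16 bit instructions even on
// subtargets whose vector unit does, so promoting to 32 bits keeps uniform
// operations on the scalar unit.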
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
      isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}