186de486dSMatt Arsenault //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// 286de486dSMatt Arsenault // 386de486dSMatt Arsenault // The LLVM Compiler Infrastructure 486de486dSMatt Arsenault // 586de486dSMatt Arsenault // This file is distributed under the University of Illinois Open Source 686de486dSMatt Arsenault // License. See LICENSE.TXT for details. 786de486dSMatt Arsenault // 886de486dSMatt Arsenault //===----------------------------------------------------------------------===// 986de486dSMatt Arsenault // 1086de486dSMatt Arsenault /// \file 1186de486dSMatt Arsenault /// This pass does misc. AMDGPU optimizations on IR before instruction 1286de486dSMatt Arsenault /// selection. 1386de486dSMatt Arsenault // 1486de486dSMatt Arsenault //===----------------------------------------------------------------------===// 1586de486dSMatt Arsenault 1686de486dSMatt Arsenault #include "AMDGPU.h" 17a1fe17c9SMatt Arsenault #include "AMDGPUIntrinsicInfo.h" 1886de486dSMatt Arsenault #include "AMDGPUSubtarget.h" 19a1fe17c9SMatt Arsenault #include "AMDGPUTargetMachine.h" 2086de486dSMatt Arsenault 2186de486dSMatt Arsenault #include "llvm/Analysis/DivergenceAnalysis.h" 2286de486dSMatt Arsenault #include "llvm/CodeGen/Passes.h" 2386de486dSMatt Arsenault #include "llvm/IR/InstVisitor.h" 2486de486dSMatt Arsenault #include "llvm/IR/IRBuilder.h" 2586de486dSMatt Arsenault #include "llvm/Support/Debug.h" 2686de486dSMatt Arsenault #include "llvm/Support/raw_ostream.h" 2786de486dSMatt Arsenault 2886de486dSMatt Arsenault #define DEBUG_TYPE "amdgpu-codegenprepare" 2986de486dSMatt Arsenault 3086de486dSMatt Arsenault using namespace llvm; 3186de486dSMatt Arsenault 3286de486dSMatt Arsenault namespace { 3386de486dSMatt Arsenault 3486de486dSMatt Arsenault class AMDGPUCodeGenPrepare : public FunctionPass, 35a1fe17c9SMatt Arsenault public InstVisitor<AMDGPUCodeGenPrepare, bool> { 36a1fe17c9SMatt Arsenault const GCNTargetMachine *TM; 37a1fe17c9SMatt Arsenault const SISubtarget *ST; 3886de486dSMatt Arsenault DivergenceAnalysis *DA; 39a1fe17c9SMatt Arsenault Module *Mod; 40a1fe17c9SMatt Arsenault bool HasUnsafeFPMath; 4186de486dSMatt Arsenault 42e14df4b2SKonstantin Zhuravlyov /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to 43e14df4b2SKonstantin Zhuravlyov /// binary operator \p V. 44e14df4b2SKonstantin Zhuravlyov /// 45e14df4b2SKonstantin Zhuravlyov /// \returns Binary operator \p V. 46e14df4b2SKonstantin Zhuravlyov Value *copyFlags(const BinaryOperator &I, Value *V) const; 47e14df4b2SKonstantin Zhuravlyov 48e14df4b2SKonstantin Zhuravlyov /// \returns Equivalent 16 bit integer type for given 32 bit integer type 49e14df4b2SKonstantin Zhuravlyov /// \p T. 50e14df4b2SKonstantin Zhuravlyov Type *getI16Ty(IRBuilder<> &B, const Type *T) const; 51e14df4b2SKonstantin Zhuravlyov 52e14df4b2SKonstantin Zhuravlyov /// \returns Equivalent 32 bit integer type for given 16 bit integer type 53e14df4b2SKonstantin Zhuravlyov /// \p T. 54e14df4b2SKonstantin Zhuravlyov Type *getI32Ty(IRBuilder<> &B, const Type *T) const; 55e14df4b2SKonstantin Zhuravlyov 56e14df4b2SKonstantin Zhuravlyov /// \returns True if the base element of type \p T is 16 bit integer, false 57e14df4b2SKonstantin Zhuravlyov /// otherwise. 58e14df4b2SKonstantin Zhuravlyov bool isI16Ty(const Type *T) const; 59e14df4b2SKonstantin Zhuravlyov 60e14df4b2SKonstantin Zhuravlyov /// \returns True if the base element of type \p T is 32 bit integer, false 61e14df4b2SKonstantin Zhuravlyov /// otherwise. 62e14df4b2SKonstantin Zhuravlyov bool isI32Ty(const Type *T) const; 63e14df4b2SKonstantin Zhuravlyov 64e14df4b2SKonstantin Zhuravlyov /// \returns True if binary operation \p I is a signed binary operation, false 65e14df4b2SKonstantin Zhuravlyov /// otherwise. 66e14df4b2SKonstantin Zhuravlyov bool isSigned(const BinaryOperator &I) const; 67e14df4b2SKonstantin Zhuravlyov 68e14df4b2SKonstantin Zhuravlyov /// \returns True if the condition of 'select' operation \p I comes from a 69e14df4b2SKonstantin Zhuravlyov /// signed 'icmp' operation, false otherwise. 70e14df4b2SKonstantin Zhuravlyov bool isSigned(const SelectInst &I) const; 71e14df4b2SKonstantin Zhuravlyov 72e14df4b2SKonstantin Zhuravlyov /// \brief Promotes uniform 16 bit binary operation \p I to equivalent 32 bit 73e14df4b2SKonstantin Zhuravlyov /// binary operation by sign or zero extending operands to 32 bits, replacing 74e14df4b2SKonstantin Zhuravlyov /// 16 bit operation with equivalent 32 bit operation, and truncating the 75e14df4b2SKonstantin Zhuravlyov /// result of 32 bit operation back to 16 bits. 16 bit division operation is 76e14df4b2SKonstantin Zhuravlyov /// not promoted. 77e14df4b2SKonstantin Zhuravlyov /// 78e14df4b2SKonstantin Zhuravlyov /// \returns True if 16 bit binary operation is promoted to equivalent 32 bit 79e14df4b2SKonstantin Zhuravlyov /// binary operation, false otherwise. 80e14df4b2SKonstantin Zhuravlyov bool promoteUniformI16OpToI32Op(BinaryOperator &I) const; 81e14df4b2SKonstantin Zhuravlyov 82e14df4b2SKonstantin Zhuravlyov /// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp' 83e14df4b2SKonstantin Zhuravlyov /// operation by sign or zero extending operands to 32 bits, and replacing 16 84e14df4b2SKonstantin Zhuravlyov /// bit operation with 32 bit operation. 85e14df4b2SKonstantin Zhuravlyov /// 86e14df4b2SKonstantin Zhuravlyov /// \returns True. 87e14df4b2SKonstantin Zhuravlyov bool promoteUniformI16OpToI32Op(ICmpInst &I) const; 88e14df4b2SKonstantin Zhuravlyov 89e14df4b2SKonstantin Zhuravlyov /// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select' 90e14df4b2SKonstantin Zhuravlyov /// operation by sign or zero extending operands to 32 bits, replacing 16 bit 91e14df4b2SKonstantin Zhuravlyov /// operation with 32 bit operation, and truncating the result of 32 bit 92e14df4b2SKonstantin Zhuravlyov /// operation back to 16 bits. 93e14df4b2SKonstantin Zhuravlyov /// 94e14df4b2SKonstantin Zhuravlyov /// \returns True. 95e14df4b2SKonstantin Zhuravlyov bool promoteUniformI16OpToI32Op(SelectInst &I) const; 96e14df4b2SKonstantin Zhuravlyov 9786de486dSMatt Arsenault public: 9886de486dSMatt Arsenault static char ID; 9986de486dSMatt Arsenault AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) : 10086de486dSMatt Arsenault FunctionPass(ID), 101a1fe17c9SMatt Arsenault TM(static_cast<const GCNTargetMachine *>(TM)), 102a1fe17c9SMatt Arsenault ST(nullptr), 103a1fe17c9SMatt Arsenault DA(nullptr), 104a1fe17c9SMatt Arsenault Mod(nullptr), 105a1fe17c9SMatt Arsenault HasUnsafeFPMath(false) { } 106a1fe17c9SMatt Arsenault 107a1fe17c9SMatt Arsenault bool visitFDiv(BinaryOperator &I); 108a1fe17c9SMatt Arsenault 109e14df4b2SKonstantin Zhuravlyov bool visitInstruction(Instruction &I) { return false; } 110e14df4b2SKonstantin Zhuravlyov bool visitBinaryOperator(BinaryOperator &I); 111e14df4b2SKonstantin Zhuravlyov bool visitICmpInst(ICmpInst &I); 112e14df4b2SKonstantin Zhuravlyov bool visitSelectInst(SelectInst &I); 11386de486dSMatt Arsenault 11486de486dSMatt Arsenault bool doInitialization(Module &M) override; 11586de486dSMatt Arsenault bool runOnFunction(Function &F) override; 11686de486dSMatt Arsenault 117117296c0SMehdi Amini StringRef getPassName() const override { return "AMDGPU IR optimizations"; } 11886de486dSMatt Arsenault 11986de486dSMatt Arsenault void getAnalysisUsage(AnalysisUsage &AU) const override { 12086de486dSMatt Arsenault AU.addRequired<DivergenceAnalysis>(); 12186de486dSMatt Arsenault AU.setPreservesAll(); 12286de486dSMatt Arsenault } 12386de486dSMatt Arsenault }; 12486de486dSMatt Arsenault 12586de486dSMatt Arsenault } // End anonymous namespace 12686de486dSMatt Arsenault 127e14df4b2SKonstantin Zhuravlyov Value *AMDGPUCodeGenPrepare::copyFlags( 128e14df4b2SKonstantin Zhuravlyov const BinaryOperator &I, Value *V) const { 129e14df4b2SKonstantin Zhuravlyov assert(isa<BinaryOperator>(V) && "V must be binary operator"); 130e14df4b2SKonstantin Zhuravlyov 131e14df4b2SKonstantin Zhuravlyov BinaryOperator *BinOp = cast<BinaryOperator>(V); 132e14df4b2SKonstantin Zhuravlyov if (isa<OverflowingBinaryOperator>(BinOp)) { 133e14df4b2SKonstantin Zhuravlyov BinOp->setHasNoSignedWrap(I.hasNoSignedWrap()); 134e14df4b2SKonstantin Zhuravlyov BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); 135e14df4b2SKonstantin Zhuravlyov } else if (isa<PossiblyExactOperator>(BinOp)) { 136e14df4b2SKonstantin Zhuravlyov BinOp->setIsExact(I.isExact()); 137e14df4b2SKonstantin Zhuravlyov } 138e14df4b2SKonstantin Zhuravlyov 139e14df4b2SKonstantin Zhuravlyov return V; 140e14df4b2SKonstantin Zhuravlyov } 141e14df4b2SKonstantin Zhuravlyov 142e14df4b2SKonstantin Zhuravlyov Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const { 143e14df4b2SKonstantin Zhuravlyov assert(isI32Ty(T) && "T must be 32 bits"); 144e14df4b2SKonstantin Zhuravlyov 145e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy()) 146e14df4b2SKonstantin Zhuravlyov return B.getInt16Ty(); 147e14df4b2SKonstantin Zhuravlyov return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements()); 148e14df4b2SKonstantin Zhuravlyov } 149e14df4b2SKonstantin Zhuravlyov 150e14df4b2SKonstantin Zhuravlyov Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { 151e14df4b2SKonstantin Zhuravlyov assert(isI16Ty(T) && "T must be 16 bits"); 152e14df4b2SKonstantin Zhuravlyov 153e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy()) 154e14df4b2SKonstantin Zhuravlyov return B.getInt32Ty(); 155e14df4b2SKonstantin Zhuravlyov return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements()); 156e14df4b2SKonstantin Zhuravlyov } 157e14df4b2SKonstantin Zhuravlyov 158e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const { 159e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy(16)) 160e14df4b2SKonstantin Zhuravlyov return true; 161e14df4b2SKonstantin Zhuravlyov if (!T->isVectorTy()) 162e14df4b2SKonstantin Zhuravlyov return false; 163e14df4b2SKonstantin Zhuravlyov return cast<VectorType>(T)->getElementType()->isIntegerTy(16); 164e14df4b2SKonstantin Zhuravlyov } 165e14df4b2SKonstantin Zhuravlyov 166e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const { 167e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy(32)) 168e14df4b2SKonstantin Zhuravlyov return true; 169e14df4b2SKonstantin Zhuravlyov if (!T->isVectorTy()) 170e14df4b2SKonstantin Zhuravlyov return false; 171e14df4b2SKonstantin Zhuravlyov return cast<VectorType>(T)->getElementType()->isIntegerTy(32); 172e14df4b2SKonstantin Zhuravlyov } 173e14df4b2SKonstantin Zhuravlyov 174e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { 175*691e2e02SKonstantin Zhuravlyov return I.getOpcode() == Instruction::AShr || 176*691e2e02SKonstantin Zhuravlyov I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; 177e14df4b2SKonstantin Zhuravlyov } 178e14df4b2SKonstantin Zhuravlyov 179e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { 180e14df4b2SKonstantin Zhuravlyov return isa<ICmpInst>(I.getOperand(0)) ? 181e14df4b2SKonstantin Zhuravlyov cast<ICmpInst>(I.getOperand(0))->isSigned() : false; 182e14df4b2SKonstantin Zhuravlyov } 183e14df4b2SKonstantin Zhuravlyov 184e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const { 185e14df4b2SKonstantin Zhuravlyov assert(isI16Ty(I.getType()) && "Op must be 16 bits"); 186e14df4b2SKonstantin Zhuravlyov 187e14df4b2SKonstantin Zhuravlyov if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv) 188e14df4b2SKonstantin Zhuravlyov return false; 189e14df4b2SKonstantin Zhuravlyov 190e14df4b2SKonstantin Zhuravlyov IRBuilder<> Builder(&I); 191e14df4b2SKonstantin Zhuravlyov Builder.SetCurrentDebugLocation(I.getDebugLoc()); 192e14df4b2SKonstantin Zhuravlyov 193e14df4b2SKonstantin Zhuravlyov Type *I32Ty = getI32Ty(Builder, I.getType()); 194e14df4b2SKonstantin Zhuravlyov Value *ExtOp0 = nullptr; 195e14df4b2SKonstantin Zhuravlyov Value *ExtOp1 = nullptr; 196e14df4b2SKonstantin Zhuravlyov Value *ExtRes = nullptr; 197e14df4b2SKonstantin Zhuravlyov Value *TruncRes = nullptr; 198e14df4b2SKonstantin Zhuravlyov 199e14df4b2SKonstantin Zhuravlyov if (isSigned(I)) { 200e14df4b2SKonstantin Zhuravlyov ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 201e14df4b2SKonstantin Zhuravlyov ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 202e14df4b2SKonstantin Zhuravlyov } else { 203e14df4b2SKonstantin Zhuravlyov ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 204e14df4b2SKonstantin Zhuravlyov ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 205e14df4b2SKonstantin Zhuravlyov } 206e14df4b2SKonstantin Zhuravlyov ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1)); 207e14df4b2SKonstantin Zhuravlyov TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType())); 208e14df4b2SKonstantin Zhuravlyov 209e14df4b2SKonstantin Zhuravlyov I.replaceAllUsesWith(TruncRes); 210e14df4b2SKonstantin Zhuravlyov I.eraseFromParent(); 211e14df4b2SKonstantin Zhuravlyov 212e14df4b2SKonstantin Zhuravlyov return true; 213e14df4b2SKonstantin Zhuravlyov } 214e14df4b2SKonstantin Zhuravlyov 215e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const { 216e14df4b2SKonstantin Zhuravlyov assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits"); 217e14df4b2SKonstantin Zhuravlyov assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits"); 218e14df4b2SKonstantin Zhuravlyov 219e14df4b2SKonstantin Zhuravlyov IRBuilder<> Builder(&I); 220e14df4b2SKonstantin Zhuravlyov Builder.SetCurrentDebugLocation(I.getDebugLoc()); 221e14df4b2SKonstantin Zhuravlyov 222e14df4b2SKonstantin Zhuravlyov Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType()); 223e14df4b2SKonstantin Zhuravlyov Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType()); 224e14df4b2SKonstantin Zhuravlyov Value *ExtOp0 = nullptr; 225e14df4b2SKonstantin Zhuravlyov Value *ExtOp1 = nullptr; 226e14df4b2SKonstantin Zhuravlyov Value *NewICmp = nullptr; 227e14df4b2SKonstantin Zhuravlyov 228e14df4b2SKonstantin Zhuravlyov if (I.isSigned()) { 229e14df4b2SKonstantin Zhuravlyov ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0); 230e14df4b2SKonstantin Zhuravlyov ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1); 231e14df4b2SKonstantin Zhuravlyov } else { 232e14df4b2SKonstantin Zhuravlyov ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0); 233e14df4b2SKonstantin Zhuravlyov ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1); 234e14df4b2SKonstantin Zhuravlyov } 235e14df4b2SKonstantin Zhuravlyov NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); 236e14df4b2SKonstantin Zhuravlyov 237e14df4b2SKonstantin Zhuravlyov I.replaceAllUsesWith(NewICmp); 238e14df4b2SKonstantin Zhuravlyov I.eraseFromParent(); 239e14df4b2SKonstantin Zhuravlyov 240e14df4b2SKonstantin Zhuravlyov return true; 241e14df4b2SKonstantin Zhuravlyov } 242e14df4b2SKonstantin Zhuravlyov 243e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const { 244e14df4b2SKonstantin Zhuravlyov assert(isI16Ty(I.getType()) && "Op must be 16 bits"); 245e14df4b2SKonstantin Zhuravlyov 246e14df4b2SKonstantin Zhuravlyov IRBuilder<> Builder(&I); 247e14df4b2SKonstantin Zhuravlyov Builder.SetCurrentDebugLocation(I.getDebugLoc()); 248e14df4b2SKonstantin Zhuravlyov 249e14df4b2SKonstantin Zhuravlyov Type *I32Ty = getI32Ty(Builder, I.getType()); 250e14df4b2SKonstantin Zhuravlyov Value *ExtOp1 = nullptr; 251e14df4b2SKonstantin Zhuravlyov Value *ExtOp2 = nullptr; 252e14df4b2SKonstantin Zhuravlyov Value *ExtRes = nullptr; 253e14df4b2SKonstantin Zhuravlyov Value *TruncRes = nullptr; 254e14df4b2SKonstantin Zhuravlyov 255e14df4b2SKonstantin Zhuravlyov if (isSigned(I)) { 256e14df4b2SKonstantin Zhuravlyov ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 257e14df4b2SKonstantin Zhuravlyov ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); 258e14df4b2SKonstantin Zhuravlyov } else { 259e14df4b2SKonstantin Zhuravlyov ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 260e14df4b2SKonstantin Zhuravlyov ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); 261e14df4b2SKonstantin Zhuravlyov } 262e14df4b2SKonstantin Zhuravlyov ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); 263e14df4b2SKonstantin Zhuravlyov TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType())); 264e14df4b2SKonstantin Zhuravlyov 265e14df4b2SKonstantin Zhuravlyov I.replaceAllUsesWith(TruncRes); 266e14df4b2SKonstantin Zhuravlyov I.eraseFromParent(); 267e14df4b2SKonstantin Zhuravlyov 268e14df4b2SKonstantin Zhuravlyov return true; 269e14df4b2SKonstantin Zhuravlyov } 270e14df4b2SKonstantin Zhuravlyov 271a1fe17c9SMatt Arsenault static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) { 272a1fe17c9SMatt Arsenault const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); 273a1fe17c9SMatt Arsenault if (!CNum) 274a1fe17c9SMatt Arsenault return false; 275a1fe17c9SMatt Arsenault 276a1fe17c9SMatt Arsenault // Reciprocal f32 is handled separately without denormals. 277e3862cdcSMatt Arsenault return UnsafeDiv || CNum->isExactlyValue(+1.0); 278a1fe17c9SMatt Arsenault } 279a1fe17c9SMatt Arsenault 280a1fe17c9SMatt Arsenault // Insert an intrinsic for fast fdiv for safe math situations where we can 281a1fe17c9SMatt Arsenault // reduce precision. Leave fdiv for situations where the generic node is 282a1fe17c9SMatt Arsenault // expected to be optimized. 283a1fe17c9SMatt Arsenault bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { 284a1fe17c9SMatt Arsenault Type *Ty = FDiv.getType(); 285a1fe17c9SMatt Arsenault 286a1fe17c9SMatt Arsenault // TODO: Handle half 287a1fe17c9SMatt Arsenault if (!Ty->getScalarType()->isFloatTy()) 288a1fe17c9SMatt Arsenault return false; 289a1fe17c9SMatt Arsenault 290a1fe17c9SMatt Arsenault MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); 291a1fe17c9SMatt Arsenault if (!FPMath) 292a1fe17c9SMatt Arsenault return false; 293a1fe17c9SMatt Arsenault 294a1fe17c9SMatt Arsenault const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); 295a1fe17c9SMatt Arsenault float ULP = FPOp->getFPAccuracy(); 296a1fe17c9SMatt Arsenault if (ULP < 2.5f) 297a1fe17c9SMatt Arsenault return false; 298a1fe17c9SMatt Arsenault 299a1fe17c9SMatt Arsenault FastMathFlags FMF = FPOp->getFastMathFlags(); 300a1fe17c9SMatt Arsenault bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || 301a1fe17c9SMatt Arsenault FMF.allowReciprocal(); 302a1fe17c9SMatt Arsenault if (ST->hasFP32Denormals() && !UnsafeDiv) 303a1fe17c9SMatt Arsenault return false; 304a1fe17c9SMatt Arsenault 305a1fe17c9SMatt Arsenault IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); 306a1fe17c9SMatt Arsenault Builder.setFastMathFlags(FMF); 307a1fe17c9SMatt Arsenault Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); 308a1fe17c9SMatt Arsenault 309a1fe17c9SMatt Arsenault const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo(); 310a1fe17c9SMatt Arsenault Function *Decl 311a1fe17c9SMatt Arsenault = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {}); 312a1fe17c9SMatt Arsenault 313a1fe17c9SMatt Arsenault Value *Num = FDiv.getOperand(0); 314a1fe17c9SMatt Arsenault Value *Den = FDiv.getOperand(1); 315a1fe17c9SMatt Arsenault 316a1fe17c9SMatt Arsenault Value *NewFDiv = nullptr; 317a1fe17c9SMatt Arsenault 318a1fe17c9SMatt Arsenault if (VectorType *VT = dyn_cast<VectorType>(Ty)) { 319a1fe17c9SMatt Arsenault NewFDiv = UndefValue::get(VT); 320a1fe17c9SMatt Arsenault 321a1fe17c9SMatt Arsenault // FIXME: Doesn't do the right thing for cases where the vector is partially 322a1fe17c9SMatt Arsenault // constant. This works when the scalarizer pass is run first. 323a1fe17c9SMatt Arsenault for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { 324a1fe17c9SMatt Arsenault Value *NumEltI = Builder.CreateExtractElement(Num, I); 325a1fe17c9SMatt Arsenault Value *DenEltI = Builder.CreateExtractElement(Den, I); 326a1fe17c9SMatt Arsenault Value *NewElt; 327a1fe17c9SMatt Arsenault 328a1fe17c9SMatt Arsenault if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) { 329a1fe17c9SMatt Arsenault NewElt = Builder.CreateFDiv(NumEltI, DenEltI); 330a1fe17c9SMatt Arsenault } else { 331a1fe17c9SMatt Arsenault NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); 332a1fe17c9SMatt Arsenault } 333a1fe17c9SMatt Arsenault 334a1fe17c9SMatt Arsenault NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); 335a1fe17c9SMatt Arsenault } 336a1fe17c9SMatt Arsenault } else { 337a1fe17c9SMatt Arsenault if (!shouldKeepFDivF32(Num, UnsafeDiv)) 338a1fe17c9SMatt Arsenault NewFDiv = Builder.CreateCall(Decl, { Num, Den }); 339a1fe17c9SMatt Arsenault } 340a1fe17c9SMatt Arsenault 341a1fe17c9SMatt Arsenault if (NewFDiv) { 342a1fe17c9SMatt Arsenault FDiv.replaceAllUsesWith(NewFDiv); 343a1fe17c9SMatt Arsenault NewFDiv->takeName(&FDiv); 344a1fe17c9SMatt Arsenault FDiv.eraseFromParent(); 345a1fe17c9SMatt Arsenault } 346a1fe17c9SMatt Arsenault 347a1fe17c9SMatt Arsenault return true; 348a1fe17c9SMatt Arsenault } 349a1fe17c9SMatt Arsenault 350a1fe17c9SMatt Arsenault static bool hasUnsafeFPMath(const Function &F) { 351a1fe17c9SMatt Arsenault Attribute Attr = F.getFnAttribute("unsafe-fp-math"); 352a1fe17c9SMatt Arsenault return Attr.getValueAsString() == "true"; 353a1fe17c9SMatt Arsenault } 354a1fe17c9SMatt Arsenault 355e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { 356e14df4b2SKonstantin Zhuravlyov bool Changed = false; 357e14df4b2SKonstantin Zhuravlyov 358e14df4b2SKonstantin Zhuravlyov // TODO: Should we promote smaller types that will be legalized to i16? 359e14df4b2SKonstantin Zhuravlyov if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I)) 360e14df4b2SKonstantin Zhuravlyov Changed |= promoteUniformI16OpToI32Op(I); 361e14df4b2SKonstantin Zhuravlyov 362e14df4b2SKonstantin Zhuravlyov return Changed; 363e14df4b2SKonstantin Zhuravlyov } 364e14df4b2SKonstantin Zhuravlyov 365e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { 366e14df4b2SKonstantin Zhuravlyov bool Changed = false; 367e14df4b2SKonstantin Zhuravlyov 368e14df4b2SKonstantin Zhuravlyov // TODO: Should we promote smaller types that will be legalized to i16? 369e14df4b2SKonstantin Zhuravlyov if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) && 370e14df4b2SKonstantin Zhuravlyov isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I)) 371e14df4b2SKonstantin Zhuravlyov Changed |= promoteUniformI16OpToI32Op(I); 372e14df4b2SKonstantin Zhuravlyov 373e14df4b2SKonstantin Zhuravlyov return Changed; 374e14df4b2SKonstantin Zhuravlyov } 375e14df4b2SKonstantin Zhuravlyov 376e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) { 377e14df4b2SKonstantin Zhuravlyov bool Changed = false; 378e14df4b2SKonstantin Zhuravlyov 379e14df4b2SKonstantin Zhuravlyov // TODO: Should we promote smaller types that will be legalized to i16? 380e14df4b2SKonstantin Zhuravlyov if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I)) 381e14df4b2SKonstantin Zhuravlyov Changed |= promoteUniformI16OpToI32Op(I); 382e14df4b2SKonstantin Zhuravlyov 383e14df4b2SKonstantin Zhuravlyov return Changed; 384e14df4b2SKonstantin Zhuravlyov } 385e14df4b2SKonstantin Zhuravlyov 38686de486dSMatt Arsenault bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { 387a1fe17c9SMatt Arsenault Mod = &M; 38886de486dSMatt Arsenault return false; 38986de486dSMatt Arsenault } 39086de486dSMatt Arsenault 39186de486dSMatt Arsenault bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { 39286de486dSMatt Arsenault if (!TM || skipFunction(F)) 39386de486dSMatt Arsenault return false; 39486de486dSMatt Arsenault 395a1fe17c9SMatt Arsenault ST = &TM->getSubtarget<SISubtarget>(F); 39686de486dSMatt Arsenault DA = &getAnalysis<DivergenceAnalysis>(); 397a1fe17c9SMatt Arsenault HasUnsafeFPMath = hasUnsafeFPMath(F); 39886de486dSMatt Arsenault 399a1fe17c9SMatt Arsenault bool MadeChange = false; 400a1fe17c9SMatt Arsenault 401a1fe17c9SMatt Arsenault for (BasicBlock &BB : F) { 402a1fe17c9SMatt Arsenault BasicBlock::iterator Next; 403a1fe17c9SMatt Arsenault for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { 404a1fe17c9SMatt Arsenault Next = std::next(I); 405a1fe17c9SMatt Arsenault MadeChange |= visit(*I); 406a1fe17c9SMatt Arsenault } 407a1fe17c9SMatt Arsenault } 408a1fe17c9SMatt Arsenault 409a1fe17c9SMatt Arsenault return MadeChange; 41086de486dSMatt Arsenault } 41186de486dSMatt Arsenault 41286de486dSMatt Arsenault INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, 41386de486dSMatt Arsenault "AMDGPU IR optimizations", false, false) 41486de486dSMatt Arsenault INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) 41586de486dSMatt Arsenault INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, 41686de486dSMatt Arsenault "AMDGPU IR optimizations", false, false) 41786de486dSMatt Arsenault 41886de486dSMatt Arsenault char AMDGPUCodeGenPrepare::ID = 0; 41986de486dSMatt Arsenault 420a1fe17c9SMatt Arsenault FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) { 42186de486dSMatt Arsenault return new AMDGPUCodeGenPrepare(TM); 42286de486dSMatt Arsenault } 423