//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for the given type \p T. For
  /// example, if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then
  /// <3 x i32> is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to a 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to an equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with an equivalent 32 bit binary
  /// operation, and truncating the result of the 32 bit binary operation back
  /// to \p I's original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;
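  // For illustration only (made-up value names), a uniform i16 add
  //   %r = add i16 %a, %b
  // is rewritten by this routine to roughly:
  //   %ext0 = zext i16 %a to i32
  //   %ext1 = zext i16 %b to i32
  //   %op   = add nuw nsw i32 %ext0, %ext1
  //   %r    = trunc i32 %op to i16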
  /// \brief Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to a 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending the
  /// operands to 32 bits, replacing \p I with a 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to a 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
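  // For illustration only (made-up value names), a uniform i16 bitreverse
  //   %r = call i16 @llvm.bitreverse.i16(i16 %a)
  // is rewritten by this routine to roughly:
  //   %ext  = zext i16 %a to i32
  //   %brev = call i32 @llvm.bitreverse.i32(i32 %ext)
  //   %shr  = lshr i32 %brev, 16   ; 32 minus the 16 bit element width
  //   %r    = trunc i32 %shr to i16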
  /// \brief Widen a scalar load.
  ///
  /// \details Widen a uniform, small-type scalar load from constant memory to
  /// a full 32 bits and then truncate the result, to allow a scalar load
  /// instead of a vector load.
  ///
  /// \returns True if the load can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
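// Informal rationale for the two helpers above: the promoted operands are
// 16 bit (or narrower) values zero or sign extended into i32, so e.g. an add
// of two zero-extended i16 values is at most 0xFFFF + 0xFFFF and cannot wrap
// in i32 in either sense, whereas 0xFFFF * 0xFFFF exceeds INT32_MAX, which is
// why mul only keeps nsw when the original operation was already nuw.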
bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
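// For illustration only (made-up value names), an fdiv that !fpmath allows to
// be off by 2.5 ulp
//   %d = fdiv float %x, %y, !fpmath !0   ; !0 = !{float 2.5}
// is replaced, when the checks below permit it, with roughly:
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)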
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
                   FMF.allowReciprocal();

  // With UnsafeDiv, the node will be optimized to just rcp and mul.
  if (ST->hasFP32Denormals() || UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}
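// For illustration only (the address space number depends on the target's
// AMDGPUAS mapping), a uniform, sufficiently aligned sub-32-bit load from
// constant memory such as
//   %v = load i16, i16 addrspace(4)* %ptr, align 4
// is rewritten below to roughly:
//   %cast = bitcast i16 addrspace(4)* %ptr to i32 addrspace(4)*
//   %wide = load i32, i32 addrspace(4)* %cast
//   %v    = trunc i32 %wide to i16
// plus a bitcast back to the original type when that type is not an integer.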
bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    Value *WidenLoad = Builder.CreateLoad(BitCast);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}