//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to a 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// Promotes uniform binary operation \p I to an equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with an equivalent 32 bit binary operation, and
  /// truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to an equivalent 32 bit binary
  /// operation, false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// Promotes uniform 'icmp' operation \p I to a 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with a 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// Promotes uniform 'select' operation \p I to a 32 bit 'select' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16.
  /// Promotion is done by sign or zero extending operands to 32 bits,
  /// replacing \p I with a 32 bit 'select' operation, and truncating the
  /// result of the 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// Promotes uniform 'bitreverse' intrinsic \p I to a 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with a 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

  /// Checks whether a scalar extending load can be widened.
  ///
  /// \details Uniform, small-type loads from constant memory are widened in
  /// visitLoadInst to a full 32 bits and then truncated, so that a scalar
  /// load can be selected instead of a vector load.
  ///
  /// \returns True if load \p I can be widened this way, false otherwise.
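  ///
  /// A minimal sketch of that rewrite (illustrative IR only; the value names
  /// are made up and the address space number stands in for the target's
  /// constant address space):
  ///   %v = load i16, i16 addrspace(4)* %p, align 4
  /// becomes
  ///   %cast = bitcast i16 addrspace(4)* %p to i32 addrspace(4)*
  ///   %wide = load i32, i32 addrspace(4)* %cast, align 4
  ///   %v    = trunc i32 %wide to i16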
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}
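
// A note on the two helpers above (added reasoning, not exhaustive): these
// opcodes are promoted with zero-extended 16-bit-or-narrower operands, so for
// example 0xFFFF + 0xFFFF = 0x1FFFE still fits comfortably in 32 bits. Add,
// sub and shl can therefore never cross the i32 signed boundary and may be
// marked nsw, while mul can (0xFFFF * 0xFFFF = 0xFFFE0001 exceeds INT32_MAX)
// and only keeps nsw when the original mul was nuw, i.e. its result already
// fit the narrow type. Likewise, sub of zero-extended operands can go
// negative, so it only keeps nuw when the original instruction had it.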

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}
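
// Illustrative example of the promotion above (value names are made up):
// a uniform 16-bit operation such as
//   %r = add i16 %a, %b
// becomes
//   %a32 = zext i16 %a to i32
//   %b32 = zext i16 %b to i32
//   %r32 = add nuw nsw i32 %a32, %b32
//   %r   = trunc i32 %r32 to i16
// with all users of the original result rewired to the trunc.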

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
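
  // Schematically, the code below emits (illustrative IR for an i8 operand):
  //   %ext = zext i8 %x to i32
  //   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
  //   %shr = lshr i32 %rev, 24       ; 32 minus the original bit width
  //   %res = trunc i32 %shr to i8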
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes = Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return HasDenormals;

  if (UnsafeDiv)
    return true;

  bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);

  // Reciprocal f32 is handled separately without denormals.
  return HasDenormals ^ IsOne;
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || FMF.allowReciprocal();

  // With UnsafeDiv, the generic fdiv node will be optimized to just rcp and
  // mul, so leave it alone.
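  // Otherwise, a sufficiently low-precision divide (2.5 ulp or more allowed
  // by !fpmath) may be rewritten below into the fast intrinsic; schematically
  // (illustrative IR):
  //   %d = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.500000e+00}
  // becomes
  //   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)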
  if (UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  bool HasDenormals = ST->hasFP32Denormals();
  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->getValue().isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
          // Don't make assumptions about the high bits.
          ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
        };

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(Mod->getContext(), LowAndHigh));
      }
    }

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  ST = &TM.getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
                    false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}