//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
  /// binary operator \p V.
  ///
  /// \returns Binary operator \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns Equivalent 16 bit integer type for given 32 bit integer type
  /// \p T.
  Type *getI16Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given 16 bit integer type
  /// \p T.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if the base element of type \p T is 16 bit integer, false
  /// otherwise.
  bool isI16Ty(const Type *T) const;

  /// \returns True if the base element of type \p T is 32 bit integer, false
  /// otherwise.
  bool isI32Ty(const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \brief Promotes uniform 16 bit binary operation \p I to equivalent 32 bit
  /// binary operation by sign or zero extending operands to 32 bits, replacing
  /// 16 bit operation with equivalent 32 bit operation, and truncating the
  /// result of 32 bit operation back to 16 bits. 16 bit division operation is
  /// not promoted.
  ///
  /// \returns True if 16 bit binary operation is promoted to equivalent 32 bit
  /// binary operation, false otherwise.
  bool promoteUniformI16OpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
  /// operation by sign or zero extending operands to 32 bits, and replacing 16
  /// bit operation with 32 bit operation.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
  /// operation by sign or zero extending operands to 32 bits, replacing 16 bit
  /// operation with 32 bit operation, and truncating the result of 32 bit
  /// operation back to 16 bits.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 16 bit 'bitreverse' intrinsic \p I to 32 bit
  /// 'bitreverse' intrinsic by zero extending operand to 32 bits, replacing 16
  /// bit intrinsic with 32 bit intrinsic, shifting the result of 32 bit
  /// intrinsic 16 bits to the right with zero fill, and truncating the result
  /// of shift operation back to 16 bits.
  ///
  /// \returns True.
  bool promoteUniformI16BitreverseIntrinsicToI32(IntrinsicInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  assert(isa<BinaryOperator>(V) && "V must be binary operator");

  BinaryOperator *BinOp = cast<BinaryOperator>(V);
  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp)) {
    BinOp->setIsExact(I.isExact());
  }

  return V;
}

Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI32Ty(T) && "T must be 32 bits");

  if (T->isIntegerTy())
    return B.getInt16Ty();
  return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI16Ty(T) && "T must be 16 bits");
bits"); 164e14df4b2SKonstantin Zhuravlyov 165e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy()) 166e14df4b2SKonstantin Zhuravlyov return B.getInt32Ty(); 167e14df4b2SKonstantin Zhuravlyov return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements()); 168e14df4b2SKonstantin Zhuravlyov } 169e14df4b2SKonstantin Zhuravlyov 170e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const { 171e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy(16)) 172e14df4b2SKonstantin Zhuravlyov return true; 173e14df4b2SKonstantin Zhuravlyov if (!T->isVectorTy()) 174e14df4b2SKonstantin Zhuravlyov return false; 175e14df4b2SKonstantin Zhuravlyov return cast<VectorType>(T)->getElementType()->isIntegerTy(16); 176e14df4b2SKonstantin Zhuravlyov } 177e14df4b2SKonstantin Zhuravlyov 178e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const { 179e14df4b2SKonstantin Zhuravlyov if (T->isIntegerTy(32)) 180e14df4b2SKonstantin Zhuravlyov return true; 181e14df4b2SKonstantin Zhuravlyov if (!T->isVectorTy()) 182e14df4b2SKonstantin Zhuravlyov return false; 183e14df4b2SKonstantin Zhuravlyov return cast<VectorType>(T)->getElementType()->isIntegerTy(32); 184e14df4b2SKonstantin Zhuravlyov } 185e14df4b2SKonstantin Zhuravlyov 186e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { 187691e2e02SKonstantin Zhuravlyov return I.getOpcode() == Instruction::AShr || 188691e2e02SKonstantin Zhuravlyov I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; 189e14df4b2SKonstantin Zhuravlyov } 190e14df4b2SKonstantin Zhuravlyov 191e14df4b2SKonstantin Zhuravlyov bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { 192e14df4b2SKonstantin Zhuravlyov return isa<ICmpInst>(I.getOperand(0)) ? 
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(BinaryOperator &I) const {
  assert(isI16Ty(I.getType()) && "I must be 16 bits");

  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

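// Similarly, promoteUniformI16OpToI32 below rewrites a uniform 16 bit compare
// (illustrative IR; value names invented):
//
//   %c = icmp slt i16 %a, %b
//
// becomes
//
//   %ext0 = sext i16 %a to i32
//   %ext1 = sext i16 %b to i32
//   %c    = icmp slt i32 %ext0, %ext1
//
// Unsigned predicates use zext. The result is already an i1, so no truncation
// is needed.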
bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(ICmpInst &I) const {
  assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
  assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
  Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32(SelectInst &I) const {
  assert(isI16Ty(I.getType()) && "I must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

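// Example of the rewrite performed by promoteUniformI16BitreverseIntrinsicToI32
// below (illustrative IR; value names invented):
//
//   %r = call i16 @llvm.bitreverse.i16(i16 %a)
//
// becomes
//
//   %ext = zext i16 %a to i32
//   %rev = call i32 @llvm.bitreverse.i32(i32 %ext)
//   %shr = lshr i32 %rev, 16
//   %r   = trunc i32 %shr to i16
//
// The shift is needed because reversing the zero extended value leaves the
// reversed 16 bit payload in the high half of the 32 bit result.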
bool AMDGPUCodeGenPrepare::promoteUniformI16BitreverseIntrinsicToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse && "I must be bitreverse");
  assert(isI16Ty(I.getType()) && "I must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp = Builder.CreateLShr(ExtRes, 16);
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
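//
// For example (illustrative IR; assumes the usual llvm.amdgcn.fdiv.fast name
// for the intrinsic), a 2.5 ulp f32 division
//
//   %d = fdiv float %x, %y, !fpmath !0   ; !0 = !{float 2.5}
//
// becomes
//
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)
//
// when f32 denormals are disabled or unsafe math is allowed. Vector divisions
// are scalarized element by element.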
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Only report a change if the fdiv was actually rewritten.
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
      isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16BitreverseIntrinsicToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}