//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) {
    return false;
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU IR optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

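// Keep the fdiv as-is when the numerator is a constant the normal lowering
// already handles well: exactly +1.0 selects to a plain reciprocal, and with
// unsafe math any constant numerator is left for the generic expansion.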
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
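    // Scalarize the vector fdiv: each lane either stays a plain fdiv or
    // becomes a call to the fast-division intrinsic, and the results are
    // reassembled with insertelement.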
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  // Only report a change if the fdiv was actually replaced.
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

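// Factory hook used when constructing the codegen pipeline. A null TM is
// tolerated; runOnFunction simply bails out in that case.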
FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}