186de486dSMatt Arsenault //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
286de486dSMatt Arsenault //
386de486dSMatt Arsenault //                     The LLVM Compiler Infrastructure
486de486dSMatt Arsenault //
586de486dSMatt Arsenault // This file is distributed under the University of Illinois Open Source
686de486dSMatt Arsenault // License. See LICENSE.TXT for details.
786de486dSMatt Arsenault //
886de486dSMatt Arsenault //===----------------------------------------------------------------------===//
986de486dSMatt Arsenault //
1086de486dSMatt Arsenault /// \file
1186de486dSMatt Arsenault /// This pass does misc. AMDGPU optimizations on IR before instruction
1286de486dSMatt Arsenault /// selection.
1386de486dSMatt Arsenault //
1486de486dSMatt Arsenault //===----------------------------------------------------------------------===//
1586de486dSMatt Arsenault 
1686de486dSMatt Arsenault #include "AMDGPU.h"
17a1fe17c9SMatt Arsenault #include "AMDGPUIntrinsicInfo.h"
1886de486dSMatt Arsenault #include "AMDGPUSubtarget.h"
19a1fe17c9SMatt Arsenault #include "AMDGPUTargetMachine.h"
2086de486dSMatt Arsenault 
2186de486dSMatt Arsenault #include "llvm/Analysis/DivergenceAnalysis.h"
2286de486dSMatt Arsenault #include "llvm/CodeGen/Passes.h"
2386de486dSMatt Arsenault #include "llvm/IR/InstVisitor.h"
2486de486dSMatt Arsenault #include "llvm/IR/IRBuilder.h"
2586de486dSMatt Arsenault #include "llvm/Support/Debug.h"
2686de486dSMatt Arsenault #include "llvm/Support/raw_ostream.h"
2786de486dSMatt Arsenault 
2886de486dSMatt Arsenault #define DEBUG_TYPE "amdgpu-codegenprepare"
2986de486dSMatt Arsenault 
3086de486dSMatt Arsenault using namespace llvm;
3186de486dSMatt Arsenault 
3286de486dSMatt Arsenault namespace {
3386de486dSMatt Arsenault 
// IR-level pass run shortly before instruction selection. Uses InstVisitor
// with a bool result so visit() reports whether each instruction was changed;
// currently the only rewrite implemented is visitFDiv.
class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM; // May be null (default ctor); pass is a no-op then.
  const SISubtarget *ST;      // Per-function subtarget, set in runOnFunction.
  DivergenceAnalysis *DA;     // Required analysis; cached in runOnFunction.
  Module *Mod;                // Current module, cached in doInitialization.
  bool HasUnsafeFPMath;       // Function has "unsafe-fp-math"="true" attribute.

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  // Rewrites eligible f32 fdiv instructions; see the definition for details.
  bool visitFDiv(BinaryOperator &I);

  // Default case: all other instructions are left untouched.
  bool visitInstruction(Instruction &I) {
    return false;
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU IR optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
 }
};
7086de486dSMatt Arsenault 
7186de486dSMatt Arsenault } // End anonymous namespace
7286de486dSMatt Arsenault 
73a1fe17c9SMatt Arsenault static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
74a1fe17c9SMatt Arsenault   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
75a1fe17c9SMatt Arsenault   if (!CNum)
76a1fe17c9SMatt Arsenault     return false;
77a1fe17c9SMatt Arsenault 
78a1fe17c9SMatt Arsenault   // Reciprocal f32 is handled separately without denormals.
79*e3862cdcSMatt Arsenault   return UnsafeDiv || CNum->isExactlyValue(+1.0);
80a1fe17c9SMatt Arsenault }
81a1fe17c9SMatt Arsenault 
82a1fe17c9SMatt Arsenault // Insert an intrinsic for fast fdiv for safe math situations where we can
83a1fe17c9SMatt Arsenault // reduce precision. Leave fdiv for situations where the generic node is
84a1fe17c9SMatt Arsenault // expected to be optimized.
85a1fe17c9SMatt Arsenault bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
86a1fe17c9SMatt Arsenault   Type *Ty = FDiv.getType();
87a1fe17c9SMatt Arsenault 
88a1fe17c9SMatt Arsenault   // TODO: Handle half
89a1fe17c9SMatt Arsenault   if (!Ty->getScalarType()->isFloatTy())
90a1fe17c9SMatt Arsenault     return false;
91a1fe17c9SMatt Arsenault 
92a1fe17c9SMatt Arsenault   MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
93a1fe17c9SMatt Arsenault   if (!FPMath)
94a1fe17c9SMatt Arsenault     return false;
95a1fe17c9SMatt Arsenault 
96a1fe17c9SMatt Arsenault   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
97a1fe17c9SMatt Arsenault   float ULP = FPOp->getFPAccuracy();
98a1fe17c9SMatt Arsenault   if (ULP < 2.5f)
99a1fe17c9SMatt Arsenault     return false;
100a1fe17c9SMatt Arsenault 
101a1fe17c9SMatt Arsenault   FastMathFlags FMF = FPOp->getFastMathFlags();
102a1fe17c9SMatt Arsenault   bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
103a1fe17c9SMatt Arsenault                                       FMF.allowReciprocal();
104a1fe17c9SMatt Arsenault   if (ST->hasFP32Denormals() && !UnsafeDiv)
105a1fe17c9SMatt Arsenault     return false;
106a1fe17c9SMatt Arsenault 
107a1fe17c9SMatt Arsenault   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
108a1fe17c9SMatt Arsenault   Builder.setFastMathFlags(FMF);
109a1fe17c9SMatt Arsenault   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
110a1fe17c9SMatt Arsenault 
111a1fe17c9SMatt Arsenault   const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
112a1fe17c9SMatt Arsenault   Function *Decl
113a1fe17c9SMatt Arsenault     = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
114a1fe17c9SMatt Arsenault 
115a1fe17c9SMatt Arsenault   Value *Num = FDiv.getOperand(0);
116a1fe17c9SMatt Arsenault   Value *Den = FDiv.getOperand(1);
117a1fe17c9SMatt Arsenault 
118a1fe17c9SMatt Arsenault   Value *NewFDiv = nullptr;
119a1fe17c9SMatt Arsenault 
120a1fe17c9SMatt Arsenault   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
121a1fe17c9SMatt Arsenault     NewFDiv = UndefValue::get(VT);
122a1fe17c9SMatt Arsenault 
123a1fe17c9SMatt Arsenault     // FIXME: Doesn't do the right thing for cases where the vector is partially
124a1fe17c9SMatt Arsenault     // constant. This works when the scalarizer pass is run first.
125a1fe17c9SMatt Arsenault     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
126a1fe17c9SMatt Arsenault       Value *NumEltI = Builder.CreateExtractElement(Num, I);
127a1fe17c9SMatt Arsenault       Value *DenEltI = Builder.CreateExtractElement(Den, I);
128a1fe17c9SMatt Arsenault       Value *NewElt;
129a1fe17c9SMatt Arsenault 
130a1fe17c9SMatt Arsenault       if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
131a1fe17c9SMatt Arsenault         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
132a1fe17c9SMatt Arsenault       } else {
133a1fe17c9SMatt Arsenault         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
134a1fe17c9SMatt Arsenault       }
135a1fe17c9SMatt Arsenault 
136a1fe17c9SMatt Arsenault       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
137a1fe17c9SMatt Arsenault     }
138a1fe17c9SMatt Arsenault   } else {
139a1fe17c9SMatt Arsenault     if (!shouldKeepFDivF32(Num, UnsafeDiv))
140a1fe17c9SMatt Arsenault       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
141a1fe17c9SMatt Arsenault   }
142a1fe17c9SMatt Arsenault 
143a1fe17c9SMatt Arsenault   if (NewFDiv) {
144a1fe17c9SMatt Arsenault     FDiv.replaceAllUsesWith(NewFDiv);
145a1fe17c9SMatt Arsenault     NewFDiv->takeName(&FDiv);
146a1fe17c9SMatt Arsenault     FDiv.eraseFromParent();
147a1fe17c9SMatt Arsenault   }
148a1fe17c9SMatt Arsenault 
149a1fe17c9SMatt Arsenault   return true;
150a1fe17c9SMatt Arsenault }
151a1fe17c9SMatt Arsenault 
152a1fe17c9SMatt Arsenault static bool hasUnsafeFPMath(const Function &F) {
153a1fe17c9SMatt Arsenault   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
154a1fe17c9SMatt Arsenault   return Attr.getValueAsString() == "true";
155a1fe17c9SMatt Arsenault }
156a1fe17c9SMatt Arsenault 
// Cache the module so visitors can create intrinsic declarations in it.
// Per-function state (ST, DA, HasUnsafeFPMath) is set up in runOnFunction.
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}
16186de486dSMatt Arsenault 
16286de486dSMatt Arsenault bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
16386de486dSMatt Arsenault   if (!TM || skipFunction(F))
16486de486dSMatt Arsenault     return false;
16586de486dSMatt Arsenault 
166a1fe17c9SMatt Arsenault   ST = &TM->getSubtarget<SISubtarget>(F);
16786de486dSMatt Arsenault   DA = &getAnalysis<DivergenceAnalysis>();
168a1fe17c9SMatt Arsenault   HasUnsafeFPMath = hasUnsafeFPMath(F);
16986de486dSMatt Arsenault 
170a1fe17c9SMatt Arsenault   bool MadeChange = false;
171a1fe17c9SMatt Arsenault 
172a1fe17c9SMatt Arsenault   for (BasicBlock &BB : F) {
173a1fe17c9SMatt Arsenault     BasicBlock::iterator Next;
174a1fe17c9SMatt Arsenault     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
175a1fe17c9SMatt Arsenault       Next = std::next(I);
176a1fe17c9SMatt Arsenault       MadeChange |= visit(*I);
177a1fe17c9SMatt Arsenault     }
178a1fe17c9SMatt Arsenault   }
179a1fe17c9SMatt Arsenault 
180a1fe17c9SMatt Arsenault   return MadeChange;
18186de486dSMatt Arsenault }
18286de486dSMatt Arsenault 
// Register the pass (and its DivergenceAnalysis dependency) with the legacy
// pass manager. The TM variants record that the pass needs a TargetMachine.
INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;
19086de486dSMatt Arsenault 
191a1fe17c9SMatt Arsenault FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
19286de486dSMatt Arsenault   return new AMDGPUCodeGenPrepare(TM);
19386de486dSMatt Arsenault }
194