1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
/// This pass performs miscellaneous AMDGPU-specific optimizations on the IR
/// before instruction selection.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 
21 #include "llvm/Analysis/DivergenceAnalysis.h"
22 #include "llvm/CodeGen/Passes.h"
23 #include "llvm/IR/InstVisitor.h"
24 #include "llvm/IR/IRBuilder.h"
25 #include "llvm/Support/Debug.h"
26 #include "llvm/Support/raw_ostream.h"
27 
28 #define DEBUG_TYPE "amdgpu-codegenprepare"
29 
30 using namespace llvm;
31 
32 namespace {
33 
34 class AMDGPUCodeGenPrepare : public FunctionPass,
35                              public InstVisitor<AMDGPUCodeGenPrepare, bool> {
36   const GCNTargetMachine *TM;
37   const SISubtarget *ST;
38   DivergenceAnalysis *DA;
39   Module *Mod;
40   bool HasUnsafeFPMath;
41 
42   /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
43   /// binary operator \p V.
44   ///
45   /// \returns Binary operator \p V.
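  ///
  /// For example (illustrative), when a 16 bit 'add nsw' is rebuilt at 32
  /// bits, the nsw flag is copied to the new operation, giving 'add nsw i32'.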
46   Value *copyFlags(const BinaryOperator &I, Value *V) const;
47 
  /// \returns The equivalent 16 bit integer type for the given 32 bit integer
  /// type \p T.
50   Type *getI16Ty(IRBuilder<> &B, const Type *T) const;
51 
  /// \returns The equivalent 32 bit integer type for the given 16 bit integer
  /// type \p T.
54   Type *getI32Ty(IRBuilder<> &B, const Type *T) const;
55 
  /// \returns True if the base element of type \p T is a 16 bit integer, false
  /// otherwise.
58   bool isI16Ty(const Type *T) const;
59 
  /// \returns True if the base element of type \p T is a 32 bit integer, false
  /// otherwise.
62   bool isI32Ty(const Type *T) const;
63 
  /// \returns True if the binary operation \p I is signed, false otherwise.
66   bool isSigned(const BinaryOperator &I) const;
67 
  /// \returns True if the condition of the 'select' operation \p I comes from
  /// a signed 'icmp' operation, false otherwise.
70   bool isSigned(const SelectInst &I) const;
71 
  /// \brief Promotes uniform 16 bit binary operation \p I to an equivalent 32
  /// bit binary operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the equivalent 32 bit operation, and
  /// truncating the result of the 32 bit operation back to 16 bits. 16 bit
  /// division operations are not promoted.
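  ///
  /// For example (illustrative IR; value names are hypothetical), a uniform
  /// 16 bit 'add', which is treated as unsigned:
  ///   %r = add i16 %a, %b
  /// is rewritten as:
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = add i32 %a.ext, %b.ext
  ///   %r = trunc i32 %r.ext to i16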
77   ///
  /// \returns True if the 16 bit binary operation was promoted to an
  /// equivalent 32 bit binary operation, false otherwise.
80   bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;
81 
  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to an equivalent 32
  /// bit 'icmp' operation by sign or zero extending operands to 32 bits, and
  /// replacing the 16 bit operation with the 32 bit operation.
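  ///
  /// For example (illustrative IR; value names are hypothetical), a uniform
  /// signed 16 bit compare:
  ///   %c = icmp slt i16 %a, %b
  /// is rewritten as:
  ///   %a.ext = sext i16 %a to i32
  ///   %b.ext = sext i16 %b to i32
  ///   %c = icmp slt i32 %a.ext, %b.ext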
85   ///
86   /// \returns True.
87   bool promoteUniformI16OpToI32Op(ICmpInst &I) const;
88 
  /// \brief Promotes uniform 16 bit 'select' operation \p I to an equivalent
  /// 32 bit 'select' operation by sign or zero extending operands to 32 bits,
  /// replacing the 16 bit operation with the 32 bit operation, and truncating
  /// the result of the 32 bit operation back to 16 bits.
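  ///
  /// For example (illustrative IR; value names are hypothetical), a uniform
  /// 16 bit 'select' whose condition comes from a signed compare:
  ///   %c = icmp slt i16 %x, %y
  ///   %r = select i1 %c, i16 %a, i16 %b
  /// is rewritten as:
  ///   %a.ext = sext i16 %a to i32
  ///   %b.ext = sext i16 %b to i32
  ///   %r.ext = select i1 %c, i32 %a.ext, i32 %b.ext
  ///   %r = trunc i32 %r.ext to i16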
93   ///
94   /// \returns True.
95   bool promoteUniformI16OpToI32Op(SelectInst &I) const;
96 
97 public:
98   static char ID;
99   AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
100     FunctionPass(ID),
101     TM(static_cast<const GCNTargetMachine *>(TM)),
102     ST(nullptr),
103     DA(nullptr),
104     Mod(nullptr),
105     HasUnsafeFPMath(false) { }
106 
107   bool visitFDiv(BinaryOperator &I);
108 
109   bool visitInstruction(Instruction &I) { return false; }
110   bool visitBinaryOperator(BinaryOperator &I);
111   bool visitICmpInst(ICmpInst &I);
112   bool visitSelectInst(SelectInst &I);
113 
114   bool doInitialization(Module &M) override;
115   bool runOnFunction(Function &F) override;
116 
117   const char *getPassName() const override {
118     return "AMDGPU IR optimizations";
119   }
120 
121   void getAnalysisUsage(AnalysisUsage &AU) const override {
122     AU.addRequired<DivergenceAnalysis>();
123     AU.setPreservesAll();
124  }
125 };
126 
127 } // End anonymous namespace
128 
129 Value *AMDGPUCodeGenPrepare::copyFlags(
130     const BinaryOperator &I, Value *V) const {
131   assert(isa<BinaryOperator>(V) && "V must be binary operator");
132 
133   BinaryOperator *BinOp = cast<BinaryOperator>(V);
134   if (isa<OverflowingBinaryOperator>(BinOp)) {
135     BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
136     BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
137   } else if (isa<PossiblyExactOperator>(BinOp)) {
138     BinOp->setIsExact(I.isExact());
139   }
140 
141   return V;
142 }
143 
144 Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
145   assert(isI32Ty(T) && "T must be 32 bits");
146 
147   if (T->isIntegerTy())
148     return B.getInt16Ty();
149   return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
150 }
151 
152 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
153   assert(isI16Ty(T) && "T must be 16 bits");
154 
155   if (T->isIntegerTy())
156     return B.getInt32Ty();
157   return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
158 }
159 
160 bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
161   if (T->isIntegerTy(16))
162     return true;
163   if (!T->isVectorTy())
164     return false;
165   return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
166 }
167 
168 bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
169   if (T->isIntegerTy(32))
170     return true;
171   if (!T->isVectorTy())
172     return false;
173   return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
174 }
175 
176 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
177   return I.getOpcode() == Instruction::SDiv ||
178       I.getOpcode() == Instruction::SRem;
179 }
180 
bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  if (const ICmpInst *Cmp = dyn_cast<ICmpInst>(I.getOperand(0)))
    return Cmp->isSigned();
  return false;
}
185 
186 bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
187   assert(isI16Ty(I.getType()) && "Op must be 16 bits");
188 
189   if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
190     return false;
191 
192   IRBuilder<> Builder(&I);
193   Builder.SetCurrentDebugLocation(I.getDebugLoc());
194 
195   Type *I32Ty = getI32Ty(Builder, I.getType());
196   Value *ExtOp0 = nullptr;
197   Value *ExtOp1 = nullptr;
198   Value *ExtRes = nullptr;
199   Value *TruncRes = nullptr;
200 
201   if (isSigned(I)) {
202     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
203     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
204   } else {
205     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
206     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
207   }
208   ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
209   TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));
210 
211   I.replaceAllUsesWith(TruncRes);
212   I.eraseFromParent();
213 
214   return true;
215 }
216 
217 bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
218   assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
219   assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");
220 
221   IRBuilder<> Builder(&I);
222   Builder.SetCurrentDebugLocation(I.getDebugLoc());
223 
224   Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
225   Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
226   Value *ExtOp0 = nullptr;
227   Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;
229 
230   if (I.isSigned()) {
231     ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
232     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
233   } else {
234     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
235     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
236   }
237   NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);
238 
239   I.replaceAllUsesWith(NewICmp);
240   I.eraseFromParent();
241 
242   return true;
243 }
244 
245 bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
246   assert(isI16Ty(I.getType()) && "Op must be 16 bits");
247 
248   IRBuilder<> Builder(&I);
249   Builder.SetCurrentDebugLocation(I.getDebugLoc());
250 
251   Type *I32Ty = getI32Ty(Builder, I.getType());
252   Value *ExtOp1 = nullptr;
253   Value *ExtOp2 = nullptr;
254   Value *ExtRes = nullptr;
255   Value *TruncRes = nullptr;
256 
257   if (isSigned(I)) {
258     ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
259     ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
260   } else {
261     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
262     ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
263   }
264   ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
265   TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));
266 
267   I.replaceAllUsesWith(TruncRes);
268   I.eraseFromParent();
269 
270   return true;
271 }
272 
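/// \returns True if the f32 fdiv with numerator \p Num should be kept as an
/// fdiv rather than replaced with the fast division intrinsic: any constant
/// numerator under unsafe math, or a numerator of exactly +1.0 (a reciprocal,
/// which is handled separately).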
273 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
274   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
275   if (!CNum)
276     return false;
277 
278   // Reciprocal f32 is handled separately without denormals.
279   return UnsafeDiv || CNum->isExactlyValue(+1.0);
280 }
281 
// Insert an intrinsic for fast fdiv in safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
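//
// For example (illustrative IR; value names are hypothetical), an f32 fdiv
// whose !fpmath metadata permits 2.5 ULP or more of error:
//   %r = fdiv float %x, %y, !fpmath !0   ; !0 = !{float 2.500000e+00}
// is replaced with a call to the fast division intrinsic:
//   %r = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)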
285 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
286   Type *Ty = FDiv.getType();
287 
288   // TODO: Handle half
289   if (!Ty->getScalarType()->isFloatTy())
290     return false;
291 
292   MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
293   if (!FPMath)
294     return false;
295 
296   const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
297   float ULP = FPOp->getFPAccuracy();
298   if (ULP < 2.5f)
299     return false;
300 
301   FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
304   if (ST->hasFP32Denormals() && !UnsafeDiv)
305     return false;
306 
307   IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
308   Builder.setFastMathFlags(FMF);
309   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
310 
311   const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
312   Function *Decl
313     = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
314 
315   Value *Num = FDiv.getOperand(0);
316   Value *Den = FDiv.getOperand(1);
317 
318   Value *NewFDiv = nullptr;
319 
320   if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
321     NewFDiv = UndefValue::get(VT);
322 
323     // FIXME: Doesn't do the right thing for cases where the vector is partially
324     // constant. This works when the scalarizer pass is run first.
325     for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
326       Value *NumEltI = Builder.CreateExtractElement(Num, I);
327       Value *DenEltI = Builder.CreateExtractElement(Den, I);
328       Value *NewElt;
329 
330       if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
331         NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
332       } else {
333         NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
334       }
335 
336       NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
337     }
338   } else {
339     if (!shouldKeepFDivF32(Num, UnsafeDiv))
340       NewFDiv = Builder.CreateCall(Decl, { Num, Den });
341   }
342 
343   if (NewFDiv) {
344     FDiv.replaceAllUsesWith(NewFDiv);
345     NewFDiv->takeName(&FDiv);
346     FDiv.eraseFromParent();
347   }
348 
  // Report a change only if the fdiv was actually rewritten.
  return !!NewFDiv;
350 }
351 
352 static bool hasUnsafeFPMath(const Function &F) {
353   Attribute Attr = F.getFnAttribute("unsafe-fp-math");
354   return Attr.getValueAsString() == "true";
355 }
356 
357 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
358   bool Changed = false;
359 
360   // TODO: Should we promote smaller types that will be legalized to i16?
361   if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
362     Changed |= promoteUniformI16OpToI32Op(I);
363 
364   return Changed;
365 }
366 
367 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
368   bool Changed = false;
369 
370   // TODO: Should we promote smaller types that will be legalized to i16?
371   if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
372           isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
373     Changed |= promoteUniformI16OpToI32Op(I);
374 
375   return Changed;
376 }
377 
378 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
379   bool Changed = false;
380 
381   // TODO: Should we promote smaller types that will be legalized to i16?
382   if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
383     Changed |= promoteUniformI16OpToI32Op(I);
384 
385   return Changed;
386 }
387 
388 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
389   Mod = &M;
390   return false;
391 }
392 
393 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
394   if (!TM || skipFunction(F))
395     return false;
396 
397   ST = &TM->getSubtarget<SISubtarget>(F);
398   DA = &getAnalysis<DivergenceAnalysis>();
399   HasUnsafeFPMath = hasUnsafeFPMath(F);
400 
401   bool MadeChange = false;
402 
403   for (BasicBlock &BB : F) {
404     BasicBlock::iterator Next;
405     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
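      // Cache the next iterator first, since visiting I may erase it from the
      // block.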
406       Next = std::next(I);
407       MadeChange |= visit(*I);
408     }
409   }
410 
411   return MadeChange;
412 }
413 
INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
416 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
417 INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
418                        "AMDGPU IR optimizations", false, false)
419 
420 char AMDGPUCodeGenPrepare::ID = 0;
421 
422 FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
423   return new AMDGPUCodeGenPrepare(TM);
424 }
425