//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for the given type \p T. For
  /// example, if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then
  /// <3 x i32> is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
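  ///
  /// As an illustrative sketch (hand-written here, not taken from a test), a
  /// uniform, unsigned 16 bit addition
  ///   %r = add i16 %a, %b
  /// is rewritten as
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = add i32 %a.ext, %b.ext
  ///   %r.new = trunc i32 %r.ext to i16
  /// with all uses of %r replaced by %r.new; the value names are hypothetical.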
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details The base element bit width of \p I's operands must be greater
  /// than 1 and less than or equal to 16. Promotion is done by sign or zero
  /// extending the operands to 32 bits, and replacing \p I with 32 bit 'icmp'
  /// operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with 32 bit 'select' operation, and truncating
  /// the result of the 32 bit 'select' operation back to \p I's original type.
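  ///
  /// Only the two value operands are extended; the i1 condition operand is
  /// used as-is. A sketch with hypothetical value names:
  ///   %r = select i1 %c, i16 %a, i16 %b
  /// is rewritten as
  ///   %a.ext = zext i16 %a to i32
  ///   %b.ext = zext i16 %b to i32
  ///   %r.ext = select i1 %c, i32 %a.ext, i32 %b.ext
  ///   %r.new = trunc i32 %r.ext to i16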
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
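  ///
  /// As a sketch with hypothetical value names, an i16 bitreverse
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// is rewritten as
  ///   %x.ext = zext i16 %x to i32
  ///   %r.ext = call i32 @llvm.bitreverse.i32(i32 %x.ext)
  ///   %r.shr = lshr i32 %r.ext, 16
  ///   %r.new = trunc i32 %r.shr to i16
  /// where the shift amount is 32 - 16 = 16.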
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
  if (!BinOp) // Possibly constant expression.
    return V;

  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp))
    BinOp->setIsExact(I.isExact());

  return V;
}

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
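//
// As an illustrative sketch (hypothetical value names, not a test case), a
// scalar f32 division whose !fpmath metadata permits 2.5 ulp or more, e.g.
//   %q = fdiv float %x, %y, !fpmath !0    ; !0 = !{float 2.5}
// is replaced by a call to the target intrinsic:
//   %q = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)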
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }
  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}