//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs miscellaneous AMDGPU-specific optimizations on the IR
/// before instruction selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation, false
  /// otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with 32 bit 'select' operation, and truncating
  /// the result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  assert(isa<BinaryOperator>(V) && "V must be binary operation");

  BinaryOperator *BinOp = cast<BinaryOperator>(V);
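  // Only propagate the flags that are meaningful for the resulting operation:
  // nsw/nuw for overflowing operators, exact for possibly-exact operators.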
  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp))
    BinOp->setIsExact(I.isExact());

  return V;
}

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

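  // Per the doc comment above, division operations are not promoted; leave
  // them for later lowering.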
  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
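  // Perform the operation at 32 bits, copy any wrap/exact flags from the
  // original instruction, and truncate the result back to the original type.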
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
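  // The reversed bits of the narrow operand land in the high bits of the i32
  // result, so shift them back down before truncating to the original type.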
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

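  // The fast expansion is only used when the !fpmath metadata allows at least
  // 2.5 ULP of error; if higher accuracy is required, keep the plain fdiv.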
  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
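  // When FP32 denormals are enabled, the reduced-precision expansion is only
  // used if unsafe math allows it; otherwise keep the fdiv for later lowering.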
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

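  // Visitors may erase the instruction they are handed (the promotion helpers
  // call eraseFromParent), so remember the next iterator before visiting.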
  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}