1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/CodeMetrics.h"
22 #include "llvm/Analysis/ConstantFolding.h"
23 #include "llvm/Analysis/InstructionSimplify.h"
24 #include "llvm/Analysis/TargetTransformInfo.h"
25 #include "llvm/IR/CallSite.h"
26 #include "llvm/IR/CallingConv.h"
27 #include "llvm/IR/DataLayout.h"
28 #include "llvm/IR/GetElementPtrTypeIterator.h"
29 #include "llvm/IR/GlobalAlias.h"
30 #include "llvm/IR/InstVisitor.h"
31 #include "llvm/IR/IntrinsicInst.h"
32 #include "llvm/IR/Operator.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/raw_ostream.h"
35 
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "inline-cost"
39 
40 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
41 
// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

// Command-line override for the inlining threshold. When given explicitly it
// takes precedence over the threshold handed to CallAnalyzer's constructor
// and over the optsize/minsize adjustments (see updateThreshold).
static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

// Raised threshold applied to callees marked with the inlinehint attribute
// (or deemed hot from profile counts) in updateThreshold.
static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));
67 
68 namespace {
69 
/// \brief Cost-model walker for a single candidate call site.
///
/// This visitor walks the callee's instructions, simulating the
/// simplifications that inlining at CandidateCS would enable, and accumulates
/// an estimated cost to compare against the (per-callsite adjusted) threshold.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  // Inlining is modeled as profitable while Cost stays below Threshold.
  // Threshold is adjusted per call site by updateThreshold().
  int Threshold;
  int Cost;

  // Properties observed while walking the callee. Several of these act as
  // hard vetoes on inlining regardless of the accumulated cost.
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  // Candidate bonuses based on the fraction of vector instructions seen;
  // VectorBonus holds the bonus actually applied.
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.  Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  /// Run the analysis; returns true if inlining this call is viable. The
  /// accumulated cost and the adjusted threshold are then available via the
  /// accessors below.
  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};
233 
234 } // namespace
235 
236 /// \brief Test whether the given value is an Alloca-derived function argument.
237 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
238   return SROAArgValues.count(V);
239 }
240 
241 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
242 /// Returns false if V does not map to a SROA-candidate.
243 bool CallAnalyzer::lookupSROAArgAndCost(
244     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
245   if (SROAArgValues.empty() || SROAArgCosts.empty())
246     return false;
247 
248   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
249   if (ArgIt == SROAArgValues.end())
250     return false;
251 
252   Arg = ArgIt->second;
253   CostIt = SROAArgCosts.find(Arg);
254   return CostIt != SROAArgCosts.end();
255 }
256 
257 /// \brief Disable SROA for the candidate marked by this cost iterator.
258 ///
259 /// This marks the candidate as no longer viable for SROA, and adds the cost
260 /// savings associated with it back into the inline cost measurement.
261 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
262   // If we're no longer able to perform SROA we need to undo its cost savings
263   // and prevent subsequent analysis.
264   Cost += CostIt->second;
265   SROACostSavings -= CostIt->second;
266   SROACostSavingsLost += CostIt->second;
267   SROAArgCosts.erase(CostIt);
268 }
269 
270 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
271 void CallAnalyzer::disableSROA(Value *V) {
272   Value *SROAArg;
273   DenseMap<Value *, int>::iterator CostIt;
274   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
275     disableSROA(CostIt);
276 }
277 
278 /// \brief Accumulate the given cost for a particular SROA candidate.
279 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
280                                       int InstructionCost) {
281   CostIt->second += InstructionCost;
282   SROACostSavings += InstructionCost;
283 }
284 
285 /// \brief Check whether a GEP's indices are all constant.
286 ///
287 /// Respects any simplified values known during the analysis of this callsite.
288 bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
289   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
290     if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
291       return false;
292 
293   return true;
294 }
295 
296 /// \brief Accumulate a constant GEP offset into an APInt if possible.
297 ///
298 /// Returns false if unable to compute the offset for any reason. Respects any
299 /// simplified values known during the analysis of this callsite.
300 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
301   const DataLayout &DL = F.getParent()->getDataLayout();
302   unsigned IntPtrWidth = DL.getPointerSizeInBits();
303   assert(IntPtrWidth == Offset.getBitWidth());
304 
305   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
306        GTI != GTE; ++GTI) {
307     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
308     if (!OpC)
309       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
310         OpC = dyn_cast<ConstantInt>(SimpleOp);
311     if (!OpC)
312       return false;
313     if (OpC->isZero())
314       continue;
315 
316     // Handle a struct index, which adds its field offset to the pointer.
317     if (StructType *STy = dyn_cast<StructType>(*GTI)) {
318       unsigned ElementIdx = OpC->getZExtValue();
319       const StructLayout *SL = DL.getStructLayout(STy);
320       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
321       continue;
322     }
323 
324     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
325     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
326   }
327   return true;
328 }
329 
330 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
331   // Check whether inlining will turn a dynamic alloca into a static
332   // alloca and handle that case.
333   if (I.isArrayAllocation()) {
334     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
335     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
336       Type *Ty = I.getAllocatedType();
337       // FIXME: This can't be right. AllocatedSize is in *bytes*.
338       AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
339       return Base::visitAlloca(I);
340     }
341   }
342 
343   // Accumulate the allocated size.
344   if (I.isStaticAlloca()) {
345     const DataLayout &DL = F.getParent()->getDataLayout();
346     Type *Ty = I.getAllocatedType();
347     AllocatedSize += DL.getTypeAllocSize(Ty);
348   }
349 
350   // We will happily inline static alloca instructions.
351   if (I.isStaticAlloca())
352     return Base::visitAlloca(I);
353 
354   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
355   // a variety of reasons, and so we would like to not inline them into
356   // functions which don't currently have a dynamic alloca. This simply
357   // disables inlining altogether in the presence of a dynamic alloca.
358   HasDynamicAlloca = true;
359   return false;
360 }
361 
362 bool CallAnalyzer::visitPHI(PHINode &I) {
363   // FIXME: We should potentially be tracking values through phi nodes,
364   // especially when they collapse to a single value due to deleted CFG edges
365   // during inlining.
366 
367   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
368   // though we don't want to propagate it's bonuses. The idea is to disable
369   // SROA if it *might* be used in an inappropriate manner.
370 
371   // Phi nodes are always zero-cost.
372   return true;
373 }
374 
375 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
376   Value *SROAArg;
377   DenseMap<Value *, int>::iterator CostIt;
378   bool SROACandidate =
379       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
380 
381   // Try to fold GEPs of constant-offset call site argument pointers. This
382   // requires target data and inbounds GEPs.
383   if (I.isInBounds()) {
384     // Check if we have a base + offset for the pointer.
385     Value *Ptr = I.getPointerOperand();
386     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
387     if (BaseAndOffset.first) {
388       // Check if the offset of this GEP is constant, and if so accumulate it
389       // into Offset.
390       if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
391         // Non-constant GEPs aren't folded, and disable SROA.
392         if (SROACandidate)
393           disableSROA(CostIt);
394         return false;
395       }
396 
397       // Add the result as a new mapping to Base + Offset.
398       ConstantOffsetPtrs[&I] = BaseAndOffset;
399 
400       // Also handle SROA candidates here, we already know that the GEP is
401       // all-constant indexed.
402       if (SROACandidate)
403         SROAArgValues[&I] = SROAArg;
404 
405       return true;
406     }
407   }
408 
409   if (isGEPOffsetConstant(I)) {
410     if (SROACandidate)
411       SROAArgValues[&I] = SROAArg;
412 
413     // Constant GEPs are modeled as free.
414     return true;
415   }
416 
417   // Variable GEPs will require math and will disable SROA.
418   if (SROACandidate)
419     disableSROA(CostIt);
420   return false;
421 }
422 
423 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
424   // Propagate constants through bitcasts.
425   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
426   if (!COp)
427     COp = SimplifiedValues.lookup(I.getOperand(0));
428   if (COp)
429     if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
430       SimplifiedValues[&I] = C;
431       return true;
432     }
433 
434   // Track base/offsets through casts
435   std::pair<Value *, APInt> BaseAndOffset =
436       ConstantOffsetPtrs.lookup(I.getOperand(0));
437   // Casts don't change the offset, just wrap it up.
438   if (BaseAndOffset.first)
439     ConstantOffsetPtrs[&I] = BaseAndOffset;
440 
441   // Also look for SROA candidates here.
442   Value *SROAArg;
443   DenseMap<Value *, int>::iterator CostIt;
444   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
445     SROAArgValues[&I] = SROAArg;
446 
447   // Bitcasts are always zero cost.
448   return true;
449 }
450 
451 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
452   // Propagate constants through ptrtoint.
453   Constant *COp = dyn_cast<Constant>(I.getOperand(0));
454   if (!COp)
455     COp = SimplifiedValues.lookup(I.getOperand(0));
456   if (COp)
457     if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
458       SimplifiedValues[&I] = C;
459       return true;
460     }
461 
462   // Track base/offset pairs when converted to a plain integer provided the
463   // integer is large enough to represent the pointer.
464   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
465   const DataLayout &DL = F.getParent()->getDataLayout();
466   if (IntegerSize >= DL.getPointerSizeInBits()) {
467     std::pair<Value *, APInt> BaseAndOffset =
468         ConstantOffsetPtrs.lookup(I.getOperand(0));
469     if (BaseAndOffset.first)
470       ConstantOffsetPtrs[&I] = BaseAndOffset;
471   }
472 
473   // This is really weird. Technically, ptrtoint will disable SROA. However,
474   // unless that ptrtoint is *used* somewhere in the live basic blocks after
475   // inlining, it will be nuked, and SROA should proceed. All of the uses which
476   // would block SROA would also block SROA if applied directly to a pointer,
477   // and so we can just add the integer in here. The only places where SROA is
478   // preserved either cannot fire on an integer, or won't in-and-of themselves
479   // disable SROA (ext) w/o some later use that we would see and disable.
480   Value *SROAArg;
481   DenseMap<Value *, int>::iterator CostIt;
482   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
483     SROAArgValues[&I] = SROAArg;
484 
485   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
486 }
487 
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Free only if the target models this conversion as free.
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
518 
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts not handled by a more specific visitor
  // above (trunc, ext, fp conversions, addrspacecast, ...).
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
535 
536 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
537   Value *Operand = I.getOperand(0);
538   Constant *COp = dyn_cast<Constant>(Operand);
539   if (!COp)
540     COp = SimplifiedValues.lookup(Operand);
541   if (COp) {
542     const DataLayout &DL = F.getParent()->getDataLayout();
543     if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
544       SimplifiedValues[&I] = C;
545       return true;
546     }
547   }
548 
549   // Disable any SROA on the argument to arbitrary unary operators.
550   disableSROA(Operand);
551 
552   return false;
553 }
554 
555 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
556   unsigned ArgNo = A->getArgNo();
557   return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
558 }
559 
560 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
561   // Does the *call site* have the NonNull attribute set on an argument?  We
562   // use the attribute on the call site to memoize any analysis done in the
563   // caller. This will also trip if the callee function has a non-null
564   // parameter attribute, but that's a less interesting case because hopefully
565   // the callee would already have been simplified based on that.
566   if (Argument *A = dyn_cast<Argument>(V))
567     if (paramHasAttr(A, Attribute::NonNull))
568       return true;
569 
570   // Is this an alloca in the caller?  This is distinct from the attribute case
571   // above because attributes aren't updated within the inliner itself and we
572   // always want to catch the alloca derived case.
573   if (isAllocaDerivedArg(V))
574     // We can actually predict the result of comparisons between an
575     // alloca-derived value and null. Note that this fires regardless of
576     // SROA firing.
577     return true;
578 
579   return false;
580 }
581 
582 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
583   // If the normal destination of the invoke or the parent block of the call
584   // site is unreachable-terminated, there is little point in inlining this
585   // unless there is literally zero cost.
586   // FIXME: Note that it is possible that an unreachable-terminated block has a
587   // hot entry. For example, in below scenario inlining hot_call_X() may be
588   // beneficial :
589   // main() {
590   //   hot_call_1();
591   //   ...
592   //   hot_call_N()
593   //   exit(0);
594   // }
595   // For now, we are not handling this corner case here as it is rare in real
596   // code. In future, we should elaborate this based on BPI and BFI in more
597   // general threshold adjusting heuristics in updateThreshold().
598   Instruction *Instr = CS.getInstruction();
599   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
600     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
601       return false;
602   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
603     return false;
604 
605   return true;
606 }
607 
void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();
  if (DefaultInlineThreshold.getNumOccurrences() > 0) {
    // Explicitly specified -inline-threshold overrides the threshold passed to
    // CallAnalyzer's constructor.
    Threshold = DefaultInlineThreshold;
  } else {
    // If -inline-threshold is not given, listen to the optsize and minsize
    // attributes when they would decrease the threshold.
    if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
      Threshold = OptMinSizeThreshold;
    else if (Caller->optForSize() && OptSizeThreshold < Threshold)
      Threshold = OptSizeThreshold;
  }

  // If profile information is available, use that to adjust threshold of hot
  // and cold functions.
  // FIXME: The heuristic used below for determining hotness and coldness are
  // based on preliminary SPEC tuning and may not be optimal. Replace this with
  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
  bool HasPGOCounts = false;
  // Both the callee's entry count and the module-wide maximum must be present
  // for the relative hotness computations below to make sense.
  if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) {
    HasPGOCounts = true;
    FunctionCount = Callee.getEntryCount().getValue();
    MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
  }

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
  // minimize its size. "Hot" here means the callee's entry count is at least
  // 30% of the hottest function in the module.
  bool InlineHint =
      Callee.hasFnAttribute(Attribute::InlineHint) ||
      (HasPGOCounts &&
       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
  if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    Threshold = HintThreshold;

  // Listen to the cold attribute or profile based coldness information
  // when it would decrease the threshold. "Cold" here means the entry count
  // is at most 1% of the hottest function in the module.
  bool ColdCallee =
      Callee.hasFnAttribute(Attribute::Cold) ||
      (HasPGOCounts &&
       FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));
  // Command line argument for DefaultInlineThreshold will override the default
  // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
  // do not use the default cold threshold even if it is smaller.
  if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
       ColdThreshold.getNumOccurrences() > 0) &&
      ColdCallee && ColdThreshold < Threshold)
    Threshold = ColdThreshold;

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();
}
670 
/// Model a comparison, folding it to a constant where our callsite-specific
/// analysis allows it.
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons: substitute any operand that
  // our analysis has already reduced to a constant.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C =
              ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  // The pointer-offset folding below only applies to integer compares.
  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons. A null compare against
  // an SROA candidate folds away when SROA fires; anything else blocks SROA.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}
735 
736 bool CallAnalyzer::visitSub(BinaryOperator &I) {
737   // Try to handle a special case: we can fold computing the difference of two
738   // constant-related pointers.
739   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
740   Value *LHSBase, *RHSBase;
741   APInt LHSOffset, RHSOffset;
742   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
743   if (LHSBase) {
744     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
745     if (RHSBase && LHSBase == RHSBase) {
746       // We have common bases, fold the subtract to a constant based on the
747       // offsets.
748       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
749       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
750       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
751         SimplifiedValues[&I] = C;
752         ++NumConstantPtrDiffs;
753         return true;
754       }
755     }
756   }
757 
758   // Otherwise, fall back to the generic logic for simplifying and handling
759   // instructions.
760   return Base::visitSub(I);
761 }
762 
763 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
764   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
765   const DataLayout &DL = F.getParent()->getDataLayout();
766   if (!isa<Constant>(LHS))
767     if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
768       LHS = SimpleLHS;
769   if (!isa<Constant>(RHS))
770     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
771       RHS = SimpleRHS;
772   Value *SimpleV = nullptr;
773   if (auto FI = dyn_cast<FPMathOperator>(&I))
774     SimpleV =
775         SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
776   else
777     SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
778 
779   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
780     SimplifiedValues[&I] = C;
781     return true;
782   }
783 
784   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
785   disableSROA(LHS);
786   disableSROA(RHS);
787 
788   return false;
789 }
790 
791 bool CallAnalyzer::visitLoad(LoadInst &I) {
792   Value *SROAArg;
793   DenseMap<Value *, int>::iterator CostIt;
794   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
795     if (I.isSimple()) {
796       accumulateSROACost(CostIt, InlineConstants::InstrCost);
797       return true;
798     }
799 
800     disableSROA(CostIt);
801   }
802 
803   return false;
804 }
805 
806 bool CallAnalyzer::visitStore(StoreInst &I) {
807   Value *SROAArg;
808   DenseMap<Value *, int>::iterator CostIt;
809   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
810     if (I.isSimple()) {
811       accumulateSROACost(CostIt, InlineConstants::InstrCost);
812       return true;
813     }
814 
815     disableSROA(CostIt);
816   }
817 
818   return false;
819 }
820 
821 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
822   // Constant folding for extract value is trivial.
823   Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
824   if (!C)
825     C = SimplifiedValues.lookup(I.getAggregateOperand());
826   if (C) {
827     SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
828     return true;
829   }
830 
831   // SROA can look through these but give them a cost.
832   return false;
833 }
834 
835 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
836   // Constant folding for insert value is trivial.
837   Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
838   if (!AggC)
839     AggC = SimplifiedValues.lookup(I.getAggregateOperand());
840   Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
841   if (!InsertedC)
842     InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
843   if (AggC && InsertedC) {
844     SimplifiedValues[&I] =
845         ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
846     return true;
847   }
848 
849   // SROA can look through these but give them a cost.
850   return false;
851 }
852 
853 /// \brief Try to simplify a call site.
854 ///
855 /// Takes a concrete function and callsite and tries to actually simplify it by
856 /// analyzing the arguments and call itself with instsimplify. Returns true if
857 /// it has simplified the callsite to some other entity (a constant), making it
858 /// free.
859 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
860   // FIXME: Using the instsimplify logic directly for this is inefficient
861   // because we have to continually rebuild the argument list even when no
862   // simplifications can be performed. Until that is fixed with remapping
863   // inside of instsimplify, directly constant fold calls here.
864   if (!canConstantFoldCallTo(F))
865     return false;
866 
867   // Try to re-map the arguments to constants.
868   SmallVector<Constant *, 4> ConstantArgs;
869   ConstantArgs.reserve(CS.arg_size());
870   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
871        ++I) {
872     Constant *C = dyn_cast<Constant>(*I);
873     if (!C)
874       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
875     if (!C)
876       return false; // This argument doesn't map to a constant.
877 
878     ConstantArgs.push_back(C);
879   }
880   if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
881     SimplifiedValues[CS.getInstruction()] = C;
882     return true;
883   }
884 
885   return false;
886 }
887 
bool CallAnalyzer::visitCallSite(CallSite CS) {
  // A returns_twice call (e.g. setjmp-like) can only be inlined into a caller
  // already marked returns_twice.
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  // Remember noduplicate calls; analyzeCall only allows inlining them when
  // the caller is removed entirely.
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  // NOTE: this local F (the callee) shadows the member F (the function being
  // analyzed).
  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        // localescape makes the containing function uninlinable; checked
        // after each instruction in analyzeBlock.
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}
975 
976 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
977   // At least one return instruction will be free after inlining.
978   bool Free = !HasReturn;
979   HasReturn = true;
980   return Free;
981 }
982 
983 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
984   // We model unconditional branches as essentially free -- they really
985   // shouldn't exist at all, but handling them makes the behavior of the
986   // inliner more regular and predictable. Interestingly, conditional branches
987   // which will fold away are also free.
988   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
989          dyn_cast_or_null<ConstantInt>(
990              SimplifiedValues.lookup(BI.getCondition()));
991 }
992 
993 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
994   // We model unconditional switches as free, see the comments on handling
995   // branches.
996   if (isa<ConstantInt>(SI.getCondition()))
997     return true;
998   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
999     if (isa<ConstantInt>(V))
1000       return true;
1001 
1002   // Otherwise, we need to accumulate a cost proportional to the number of
1003   // distinct successor blocks. This fan-out in the CFG cannot be represented
1004   // for free even if we can represent the core switch as a jumptable that
1005   // takes a single instruction.
1006   //
1007   // NB: We convert large switches which are just used to initialize large phi
1008   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1009   // inlining those. It will prevent inlining in cases where the optimization
1010   // does not (yet) fire.
1011   SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
1012   SuccessorBlocks.insert(SI.getDefaultDest());
1013   for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
1014     SuccessorBlocks.insert(I.getCaseSuccessor());
1015   // Add cost corresponding to the number of distinct destinations. The first
1016   // we model as free because of fallthrough.
1017   Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
1018   return false;
1019 }
1020 
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr.  This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  // Setting this flag aborts the analysis in analyzeBlock.
  HasIndirectBr = true;
  return false;
}
1033 
bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // Returning false charges the default single-instruction cost.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}
1039 
bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // Returning false charges the default single-instruction cost.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}
1045 
bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // Returning false charges the default single-instruction cost.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}
1051 
bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonably to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}
1058 
1059 bool CallAnalyzer::visitInstruction(Instruction &I) {
1060   // Some instructions are free. All of the free intrinsics can also be
1061   // handled by SROA, etc.
1062   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1063     return true;
1064 
1065   // We found something we don't understand or can't handle. Mark any SROA-able
1066   // values in the operand list as no longer viable.
1067   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1068     disableSROA(*OI);
1069 
1070   return false;
1071 }
1072 
1073 /// \brief Analyze a basic block for its contribution to the inline cost.
1074 ///
1075 /// This method walks the analyzer over every instruction in the given basic
1076 /// block and accounts for their cost during inlining at this callsite. It
1077 /// aborts early if the threshold has been exceeded or an impossible to inline
1078 /// construct has been detected. It returns false if inlining is no longer
1079 /// viable, and true if inlining remains viable.
1080 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1081                                 SmallPtrSetImpl<const Value *> &EphValues) {
1082   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1083     // FIXME: Currently, the number of instructions in a function regardless of
1084     // our ability to simplify them during inline to constants or dead code,
1085     // are actually used by the vector bonus heuristic. As long as that's true,
1086     // we have to special case debug intrinsics here to prevent differences in
1087     // inlining due to debug symbols. Eventually, the number of unsimplified
1088     // instructions shouldn't factor into the cost computation, but until then,
1089     // hack around it here.
1090     if (isa<DbgInfoIntrinsic>(I))
1091       continue;
1092 
1093     // Skip ephemeral values.
1094     if (EphValues.count(&*I))
1095       continue;
1096 
1097     ++NumInstructions;
1098     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1099       ++NumVectorInstructions;
1100 
1101     // If the instruction is floating point, and the target says this operation
1102     // is expensive or the function has the "use-soft-float" attribute, this may
1103     // eventually become a library call. Treat the cost as such.
1104     if (I->getType()->isFloatingPointTy()) {
1105       bool hasSoftFloatAttr = false;
1106 
1107       // If the function has the "use-soft-float" attribute, mark it as
1108       // expensive.
1109       if (F.hasFnAttribute("use-soft-float")) {
1110         Attribute Attr = F.getFnAttribute("use-soft-float");
1111         StringRef Val = Attr.getValueAsString();
1112         if (Val == "true")
1113           hasSoftFloatAttr = true;
1114       }
1115 
1116       if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
1117           hasSoftFloatAttr)
1118         Cost += InlineConstants::CallPenalty;
1119     }
1120 
1121     // If the instruction simplified to a constant, there is no cost to this
1122     // instruction. Visit the instructions using our InstVisitor to account for
1123     // all of the per-instruction logic. The visit tree returns true if we
1124     // consumed the instruction in any way, and false if the instruction's base
1125     // cost should count against inlining.
1126     if (Base::visit(&*I))
1127       ++NumInstructionsSimplified;
1128     else
1129       Cost += InlineConstants::InstrCost;
1130 
1131     // If the visit this instruction detected an uninlinable pattern, abort.
1132     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1133         HasIndirectBr || HasFrameEscape)
1134       return false;
1135 
1136     // If the caller is a recursive function then we don't want to inline
1137     // functions which allocate a lot of stack space because it would increase
1138     // the caller stack usage dramatically.
1139     if (IsCallerRecursive &&
1140         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
1141       return false;
1142 
1143     // Check if we've past the maximum possible threshold so we don't spin in
1144     // huge basic blocks that will never inline.
1145     if (Cost > Threshold)
1146       return false;
1147   }
1148 
1149   return true;
1150 }
1151 
1152 /// \brief Compute the base pointer and cumulative constant offsets for V.
1153 ///
1154 /// This strips all constant offsets off of V, leaving it the base pointer, and
1155 /// accumulates the total constant offset applied in the returned constant. It
1156 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1157 /// no constant offsets applied.
1158 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1159   if (!V->getType()->isPointerTy())
1160     return nullptr;
1161 
1162   const DataLayout &DL = F.getParent()->getDataLayout();
1163   unsigned IntPtrWidth = DL.getPointerSizeInBits();
1164   APInt Offset = APInt::getNullValue(IntPtrWidth);
1165 
1166   // Even though we don't look through PHI nodes, we could be called on an
1167   // instruction in an unreachable block, which may be on a cycle.
1168   SmallPtrSet<Value *, 4> Visited;
1169   Visited.insert(V);
1170   do {
1171     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1172       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1173         return nullptr;
1174       V = GEP->getPointerOperand();
1175     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1176       V = cast<Operator>(V)->getOperand(0);
1177     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1178       if (GA->isInterposable())
1179         break;
1180       V = GA->getAliasee();
1181     } else {
1182       break;
1183     }
1184     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1185   } while (Visited.insert(V).second);
1186 
1187   Type *IntPtrTy = DL.getIntPtrType(V->getContext());
1188   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1189 }
1190 
/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are some what arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  // Maximum vector bonuses; the excess is subtracted again at the end once
  // the actual vector-instruction density is known.
  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  // A declaration-only callee has no body to analyze; treat it as viable.
  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself (i.e. any call site of
  // Caller lives inside Caller's own body).
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          // Only the taken successor is live after inlining.
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        // Only the matched case's successor is live after inlining.
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, than post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < std::max(1, Threshold);
}
1415 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
// Emits each statistic as an indented "name: value" line to dbgs().
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif
1435 
1436 /// \brief Test that two functions either have or have not the given attribute
1437 ///        at the same time.
1438 template <typename AttrKind>
1439 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
1440   return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
1441 }
1442 
1443 /// \brief Test that there are no attribute conflicts between Caller and Callee
1444 ///        that prevent inlining.
1445 static bool functionsHaveCompatibleAttributes(Function *Caller,
1446                                               Function *Callee,
1447                                               TargetTransformInfo &TTI) {
1448   return TTI.areInlineCompatible(Caller, Callee) &&
1449          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1450 }
1451 
1452 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
1453                                TargetTransformInfo &CalleeTTI,
1454                                AssumptionCacheTracker *ACT) {
1455   return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
1456                        ACT);
1457 }
1458 
1459 int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
1460                                         unsigned SizeOptLevel) {
1461   if (OptLevel > 2)
1462     return OptAggressiveThreshold;
1463   if (SizeOptLevel == 1) // -Os
1464     return OptSizeThreshold;
1465   if (SizeOptLevel == 2) // -Oz
1466     return OptMinSizeThreshold;
1467   return DefaultInlineThreshold;
1468 }
1469 
1470 int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }
1471 
1472 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
1473                                int DefaultThreshold,
1474                                TargetTransformInfo &CalleeTTI,
1475                                AssumptionCacheTracker *ACT) {
1476 
1477   // Cannot inline indirect calls.
1478   if (!Callee)
1479     return llvm::InlineCost::getNever();
1480 
1481   // Calls to functions with always-inline attributes should be inlined
1482   // whenever possible.
1483   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1484     if (isInlineViable(*Callee))
1485       return llvm::InlineCost::getAlways();
1486     return llvm::InlineCost::getNever();
1487   }
1488 
1489   // Never inline functions with conflicting attributes (unless callee has
1490   // always-inline attribute).
1491   if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
1492     return llvm::InlineCost::getNever();
1493 
1494   // Don't inline this call if the caller has the optnone attribute.
1495   if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
1496     return llvm::InlineCost::getNever();
1497 
1498   // Don't inline functions which can be interposed at link-time.  Don't inline
1499   // functions marked noinline or call sites marked noinline.
1500   // Note: inlining non-exact non-interposable fucntions is fine, since we know
1501   // we have *a* correct implementation of the source level function.
1502   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
1503       CS.isNoInline())
1504     return llvm::InlineCost::getNever();
1505 
1506   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
1507                      << "...\n");
1508 
1509   CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
1510   bool ShouldInline = CA.analyzeCall(CS);
1511 
1512   DEBUG(CA.dump());
1513 
1514   // Check if there was a reason to force inlining or no inlining.
1515   if (!ShouldInline && CA.getCost() < CA.getThreshold())
1516     return InlineCost::getNever();
1517   if (ShouldInline && CA.getCost() >= CA.getThreshold())
1518     return InlineCost::getAlways();
1519 
1520   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
1521 }
1522 
1523 bool llvm::isInlineViable(Function &F) {
1524   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
1525   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
1526     // Disallow inlining of functions which contain indirect branches or
1527     // blockaddresses.
1528     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
1529       return false;
1530 
1531     for (auto &II : *BI) {
1532       CallSite CS(&II);
1533       if (!CS)
1534         continue;
1535 
1536       // Disallow recursive calls.
1537       if (&F == CS.getCalledFunction())
1538         return false;
1539 
1540       // Disallow calls which expose returns-twice to a function not previously
1541       // attributed as such.
1542       if (!ReturnsTwice && CS.isCall() &&
1543           cast<CallInst>(CS.getInstruction())->canReturnTwice())
1544         return false;
1545 
1546       // Disallow inlining functions that call @llvm.localescape. Doing this
1547       // correctly would require major changes to the inliner.
1548       if (CS.getCalledFunction() &&
1549           CS.getCalledFunction()->getIntrinsicID() ==
1550               llvm::Intrinsic::localescape)
1551         return false;
1552     }
1553   }
1554 
1555   return true;
1556 }
1557