//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));
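
// Note on precedence (see updateThreshold below): an explicit
// -inline-threshold overrides the optsize/minsize defaults, while
// -inlinecold-threshold reasserts the cold adjustment even when
// -inline-threshold is given.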
67 
68 namespace {
69 
70 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
71   typedef InstVisitor<CallAnalyzer, bool> Base;
72   friend class InstVisitor<CallAnalyzer, bool>;
73 
74   /// The TargetTransformInfo available for this compilation.
75   const TargetTransformInfo &TTI;
76 
77   /// The cache of @llvm.assume intrinsics.
78   AssumptionCacheTracker *ACT;
79 
80   // The called function.
81   Function &F;
82 
83   // The candidate callsite being analyzed. Please do not use this to do
84   // analysis in the caller function; we want the inline cost query to be
85   // easily cacheable. Instead, use the cover function paramHasAttr.
86   CallSite CandidateCS;
87 
88   int Threshold;
89   int Cost;
90 
91   bool IsCallerRecursive;
92   bool IsRecursiveCall;
93   bool ExposesReturnsTwice;
94   bool HasDynamicAlloca;
95   bool ContainsNoDuplicateCall;
96   bool HasReturn;
97   bool HasIndirectBr;
98   bool HasFrameEscape;
99 
100   /// Number of bytes allocated statically by the callee.
101   uint64_t AllocatedSize;
102   unsigned NumInstructions, NumVectorInstructions;
103   int FiftyPercentVectorBonus, TenPercentVectorBonus;
104   int VectorBonus;
105 
106   // While we walk the potentially-inlined instructions, we build up and
107   // maintain a mapping of simplified values specific to this callsite. The
108   // idea is to propagate any special information we have about arguments to
109   // this call through the inlinable section of the function, and account for
110   // likely simplifications post-inlining. The most important aspect we track
111   // is CFG altering simplifications -- when we prove a basic block dead, that
112   // can cause dramatic shifts in the cost of inlining a function.
113   DenseMap<Value *, Constant *> SimplifiedValues;
114 
115   // Keep track of the values which map back (through function arguments) to
116   // allocas on the caller stack which could be simplified through SROA.
117   DenseMap<Value *, Value *> SROAArgValues;
118 
119   // The mapping of caller Alloca values to their accumulated cost savings. If
120   // we have to disable SROA for one of the allocas, this tells us how much
121   // cost must be added.
122   DenseMap<Value *, int> SROAArgCosts;
123 
124   // Keep track of values which map to a pointer base and constant offset.
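  // Illustrative example: if the caller passes "getelementptr inbounds
  // i8, i8* %buf, i64 8" as an argument, the corresponding formal parameter
  // maps to {%buf, 8}, letting later GEPs, compares, and pointer differences
  // fold.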
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.  Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
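  /// For example (illustrative), "icmp eq i8* %arg, null" folds to false in
  /// visitCmpInst when the call site passes a caller-local alloca for %arg.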
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
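  /// Worked example with the current defaults: a callee carrying inlinehint,
  /// called from a caller that is not minimizing size, starts at the default
  /// threshold of 225, is raised to 325 by -inlinehint-threshold, and is then
  /// scaled by the target's inlining threshold multiplier.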
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
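/// Illustrative example: for "getelementptr inbounds {i32, i32},
/// {i32, i32}* %b, i32 0, i32 1", the first index is zero and is skipped,
/// while the struct index adds the field offset of element 1 (4 bytes given
/// 32-bit i32), so Offset grows by 4.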
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.
  if (I.isArrayAllocation()) {
    if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
      ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
      assert(AllocSize && "Allocation size not a constant int?");
      Type *Ty = I.getAllocatedType();
      // AllocatedSize is measured in bytes, so use the DataLayout byte size
      // of the element type rather than its bit width.
      AllocatedSize += DL.getTypeAllocSize(Ty) * AllocSize->getZExtValue();
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size; we will happily inline static alloca
  // instructions.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize += DL.getTypeAllocSize(Ty);
    return Base::visitAlloca(I);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and they disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here; we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
  return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument?  We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller?  This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
587   // hot entry. For example, in below scenario inlining hot_call_X() may be
588   // beneficial :
589   // main() {
590   //   hot_call_1();
591   //   ...
592   //   hot_call_N()
593   //   exit(0);
594   // }
595   // For now, we are not handling this corner case here as it is rare in real
596   // code. In future, we should elaborate this based on BPI and BFI in more
597   // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  // If -inline-threshold is not given, listen to the optsize and minsize
  // attributes when they would decrease the threshold.
  Function *Caller = CS.getCaller();

  if (DefaultInlineThreshold.getNumOccurrences() == 0) {
    if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
      Threshold = OptMinSizeThreshold;
    else if (Caller->optForSize() && OptSizeThreshold < Threshold)
      Threshold = OptSizeThreshold;
  }

  // If profile information is available, use it to adjust the threshold of
  // hot and cold functions.
  // FIXME: The heuristics used below for determining hotness and coldness are
  // based on preliminary SPEC tuning and may not be optimal. Replace this with
  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
  bool HasPGOCounts = false;
  if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) {
    HasPGOCounts = true;
    FunctionCount = Callee.getEntryCount().getValue();
    MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
  }

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
  // minimize its size.
  bool InlineHint =
      Callee.hasFnAttribute(Attribute::InlineHint) ||
      (HasPGOCounts &&
       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
  if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    Threshold = HintThreshold;

  // Listen to the cold attribute or profile based coldness information
  // when it would decrease the threshold.
  bool ColdCallee =
      Callee.hasFnAttribute(Attribute::Cold) ||
      (HasPGOCounts &&
       FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));
  // An explicit -inline-threshold overrides the default ColdThreshold: if we
  // have -inline-threshold but no -inlinecold-threshold, do not use the
  // default cold threshold even if it is smaller.
  if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
       ColdThreshold.getNumOccurrences() > 0) &&
      ColdCallee && ColdThreshold < Threshold)
    Threshold = ColdThreshold;

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C =
              ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
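  // Illustrative fold: with %a mapped to {%base, 4} and %b to {%base, 8} in
  // ConstantOffsetPtrs, "icmp ult i8* %a, %b" becomes "icmp ult 4, 8" and
  // folds to true.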
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
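  // Illustrative fold: if the operands are the ptrtoint values of pointers
  // mapped to {%base, 12} and {%base, 4}, the subtract folds to the
  // constant 8.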
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] =
        ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
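/// For example (illustrative), a call to a constant-foldable intrinsic such
/// as llvm.sqrt.f64 whose argument has simplified to a constant folds to a
/// constant result here and is treated as free.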
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model switches whose condition folds to a constant as free, matching
  // the handling of foldable conditional branches above.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
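  // Worked example: a switch whose default and cases reach four distinct
  // successor blocks accrues (4 - 1) * InlineConstants::InstrCost below.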
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers,
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function, which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently the number of instructions in a function, regardless of
    // our ability to simplify them to constants or dead code during inlining,
    // is used by the vector bonus heuristic. As long as that's true, we have
    // to special case debug intrinsics here to prevent differences in inlining
    // due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
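/// Illustrative example: for a chain of two inbounds GEPs adding 4 and then
/// 8 bytes to %base, V is rewritten to %base and the returned constant
/// is 12.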
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
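      // Worked example: a 256-byte byval struct with 64-bit pointers needs
      // ceil(2048 / 64) = 32 word copies, capped at 8 below, for a bonus of
      // 2 * 8 * InstrCost.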
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this we use a small-size-optimized SetVector, favoring cheap
  // small iterations because we exit early once we cross our threshold.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size: this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when under-counting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
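    // For example (illustrative): given
    //   @tbl = global i8* blockaddress(@callee, %bb)
    // inlining @callee would leave @tbl referring to a block that now lives
    // in the caller, which is an invalid cross-function blockaddress.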
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }
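    // For example (illustrative): if an i1 argument is the constant true at
    // this call site, then for
    //   br i1 %flag, label %then, label %else
    // only %then is added to the worklist; %else (and anything reachable only
    // from it) never contributes to the cost at this call site.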

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the callee (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
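  // A worked example (with hypothetical bonus values of 150 and 75): a callee
  // whose vector instructions make up 30% of all instructions falls between
  // the 10% and 50% cut-offs, so it gives back 150 - 75 = 75 of the up-front
  // bonus here and keeps the remaining 75 in the threshold.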

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that two functions either both have or both lack the given
///        attribute.
template <typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}
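// An illustrative use (hypothetical, not a call that appears below):
//   attributeMatches(Caller, Callee, "target-cpu")
// returns true only when both functions carry an identical "target-cpu"
// string attribute, or both lack it.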

/// \brief Test that there are no attribute conflicts between Caller and Callee
///        that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT) {
  return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
                       ACT);
}

int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
                                        unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return OptMinSizeThreshold;
  return DefaultInlineThreshold;
}
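// For example (illustrative): computeThresholdFromOptLevels(3, 0) returns
// OptAggressiveThreshold for -O3, computeThresholdFromOptLevels(2, 1) returns
// OptSizeThreshold for -Os, and computeThresholdFromOptLevels(2, 2) returns
// OptMinSizeThreshold for -Oz.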

int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }

InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
                               int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless the callee has
  // the always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time.  Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();
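  // For example (illustrative): a callee with weak linkage may be replaced by
  // a different definition at link time, so inlining the body visible here
  // could bake the wrong implementation into the caller.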

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;
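      // For example (illustrative): a call to setjmp can return twice, so
      // inlining a function that calls it into a caller not already marked
      // returns_twice would silently change the caller's semantics.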

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}