//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  int Threshold;
  int Cost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.  Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);     void visit(Module &);
  void visit(Function *);   void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
    : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
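///
/// For example (illustrative): on a target with 64-bit pointers, a GEP such as
///   %p = getelementptr inbounds {i32, i32}, {i32, i32}* %base, i32 0, i32 1
/// adds 4 to Offset -- the leading zero index contributes nothing, and the
/// struct index selects field 1, which the struct layout places at byte 4.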
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.
  if (I.isArrayAllocation()) {
    if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
      ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
      assert(AllocSize && "Allocation size not a constant int?");
      Type *Ty = I.getAllocatedType();
      AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    Type *Ty = I.getAllocatedType();
    AllocatedSize += DL.getTypeAllocSize(Ty);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset
    = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
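  // Note: CallSite attribute indices treat index 0 as the return value and
  // number the parameters starting at 1, hence the ArgNo + 1 below.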
  return CandidateCS.paramHasAttr(ArgNo+1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument?  We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller?  This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  // If -inline-threshold is not given, listen to the optsize and minsize
  // attributes when they would decrease the threshold.
  Function *Caller = CS.getCaller();

  if (!(DefaultInlineThreshold.getNumOccurrences() > 0)) {
    if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
      Threshold = OptMinSizeThreshold;
    else if (Caller->optForSize() && OptSizeThreshold < Threshold)
      Threshold = OptSizeThreshold;
  }

  // If profile information is available, use that to adjust threshold of hot
  // and cold functions.
  // FIXME: The heuristics used below for determining hotness and coldness are
  // based on preliminary SPEC tuning and may not be optimal. Replace this with
  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
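  // For example (illustrative): with a module-wide maximum entry count of
  // 10000, the checks below treat a callee entry count of 3000 or more as hot
  // and of 100 or fewer as cold.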
  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
  bool HasPGOCounts = false;
  if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) {
    HasPGOCounts = true;
    FunctionCount = Callee.getEntryCount().getValue();
    MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
  }

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
  // minimize its size.
  bool InlineHint =
      Callee.hasFnAttribute(Attribute::InlineHint) ||
      (HasPGOCounts &&
       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
  if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    Threshold = HintThreshold;

  // Listen to the cold attribute or profile based coldness information
  // when it would decrease the threshold.
  bool ColdCallee =
      Callee.hasFnAttribute(Attribute::Cold) ||
      (HasPGOCounts &&
       FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));
  // Command line argument for DefaultInlineThreshold will override the default
  // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
  // do not use the default cold threshold even if it is smaller.
  if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
       ColdThreshold.getNumOccurrences() > 0) &&
      ColdCallee && ColdThreshold < Threshold)
    Threshold = ColdThreshold;

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
                                                        I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
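///
/// For example (illustrative): a call to a constant-foldable intrinsic such as
/// llvm.ctpop.i32 whose argument has already been simplified to a constant
/// folds to a constant here and is therefore modeled as free.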
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. We model
  // the first destination as free because of the fallthrough.
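  // For example (illustrative): a switch whose default and four case labels
  // reach three distinct blocks adds (3 - 1) * InstrCost here.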
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr.  This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}


/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is used by the vector bonus heuristic. As long as that's true, we
    // have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
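///
/// For example (illustrative): with 64-bit pointers, if V is an inbounds GEP
/// of a bitcast of %base with a constant index of 3 over i32 elements, this
/// returns the constant 12 and rewrites V to %base.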
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
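  // For example (illustrative): assuming the default threshold of 225 and a
  // TTI multiplier of 1, the speculative ceiling becomes 225 + 112 + 337.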

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
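      // For example (illustrative): a 256-bit byval struct copied with 64-bit
      // pointers needs ceil(256 / 64) = 4 word-sized loads and stores.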

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
    &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                                  SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
1343     // returns false, and we can bail out.
1344     if (!analyzeBlock(BB, EphValues))
1345       return false;
1346 
1347     TerminatorInst *TI = BB->getTerminator();
1348 
1349     // Add in the live successors by first checking whether the terminator may
1350     // be simplified based on the values simplified by this call.
1351     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1352       if (BI->isConditional()) {
1353         Value *Cond = BI->getCondition();
1354         if (ConstantInt *SimpleCond
1355               = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1356           BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
1357           continue;
1358         }
1359       }
1360     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1361       Value *Cond = SI->getCondition();
1362       if (ConstantInt *SimpleCond
1363             = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
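             // A switch over a known-constant condition reaches exactly one
             // successor: the matching case's block, or the default destination
             // if no case matches.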
1364         BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
1365         continue;
1366       }
1367     }
1368 
1369     // If we're unable to select a particular successor, just count all of
1370     // them.
1371     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1372          ++TIdx)
1373       BBWorklist.insert(TI->getSuccessor(TIdx));
1374 
1375     // If we had any successors at this point, then post-inlining is likely to
1376     // have them as well. Note that we assume any basic blocks which existed
1377     // due to branches or switches which folded above will also fold after
1378     // inlining.
1379     if (SingleBB && TI->getNumSuccessors() > 1) {
1380       // Take off the bonus we applied to the threshold.
1381       Threshold -= SingleBBBonus;
1382       SingleBB = false;
1383     }
1384   }
1385 
1386   // If this is a noduplicate call, we can still inline as long as
1387   // inlining this would cause the removal of the caller (so the instruction
1388   // is not actually duplicated, just moved).
1389   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1390     return false;
1391 
1392   // We applied the maximum possible vector bonus at the beginning. Now,
1393   // subtract the excess bonus, if any, from the Threshold before
1394   // comparing against Cost.
1395   if (NumVectorInstructions <= NumInstructions / 10)
1396     Threshold -= FiftyPercentVectorBonus;
1397   else if (NumVectorInstructions <= NumInstructions / 2)
1398     Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);
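       // (Callees that are more than half vector instructions keep the full
       // bonus; those between roughly 10% and 50% keep only the smaller tier; at
       // or below roughly 10% the entire bonus is removed.)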
1399 
1400   return Cost < std::max(1, Threshold);
1401 }
1402 
1403 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1404 /// \brief Dump stats about this call's analysis.
1405 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1406 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
1407   DEBUG_PRINT_STAT(NumConstantArgs);
1408   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1409   DEBUG_PRINT_STAT(NumAllocaArgs);
1410   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1411   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1412   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1413   DEBUG_PRINT_STAT(NumInstructions);
1414   DEBUG_PRINT_STAT(SROACostSavings);
1415   DEBUG_PRINT_STAT(SROACostSavingsLost);
1416   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1417   DEBUG_PRINT_STAT(Cost);
1418   DEBUG_PRINT_STAT(Threshold);
1419 #undef DEBUG_PRINT_STAT
1420 }
1421 #endif
1422 
1423 /// \brief Test whether two functions both have, or both lack, the given
1424 ///        attribute.
1425 template<typename AttrKind>
1426 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
1427   return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
1428 }
1429 
1430 /// \brief Test that there are no attribute conflicts between Caller and Callee
1431 ///        that prevent inlining.
1432 static bool functionsHaveCompatibleAttributes(Function *Caller,
1433                                               Function *Callee,
1434                                               TargetTransformInfo &TTI) {
1435   return TTI.areInlineCompatible(Caller, Callee) &&
1436          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1437 }
1438 
1439 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
1440                                TargetTransformInfo &CalleeTTI,
1441                                AssumptionCacheTracker *ACT) {
1442   return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
1443                        ACT);
1444 }
1445 
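     /// \brief Select an inline threshold from the optimization and size
     /// optimization levels: -O3 and above use the aggressive threshold, -Os and
     /// -Oz use the size thresholds, and everything else falls back to the
     /// default inline threshold.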
1446 int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
1447                                         unsigned SizeOptLevel) {
1448   if (OptLevel > 2)
1449     return OptAggressiveThreshold;
1450   if (SizeOptLevel == 1) // -Os
1451     return OptSizeThreshold;
1452   if (SizeOptLevel == 2) // -Oz
1453     return OptMinSizeThreshold;
1454   return DefaultInlineThreshold;
1455 }
1456 
1457 int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }
1458 
1459 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
1460                                int DefaultThreshold,
1461                                TargetTransformInfo &CalleeTTI,
1462                                AssumptionCacheTracker *ACT) {
1463 
1464   // Cannot inline indirect calls.
1465   if (!Callee)
1466     return llvm::InlineCost::getNever();
1467 
1468   // Calls to functions with always-inline attributes should be inlined
1469   // whenever possible.
1470   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1471     if (isInlineViable(*Callee))
1472       return llvm::InlineCost::getAlways();
1473     return llvm::InlineCost::getNever();
1474   }
1475 
1476   // Never inline functions with conflicting attributes (unless the callee has
1477   // the always-inline attribute, handled above).
1478   if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
1479     return llvm::InlineCost::getNever();
1480 
1481   // Don't inline this call if the caller has the optnone attribute.
1482   if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
1483     return llvm::InlineCost::getNever();
1484 
1485   // Don't inline functions which can be interposed at link-time.  Don't inline
1486   // functions marked noinline or call sites marked noinline.
1487   // Note: inlining non-exact non-interposable functions is fine, since we know
1488   // we have *a* correct implementation of the source level function.
1489   if (Callee->isInterposable() ||
1490       Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
1491     return llvm::InlineCost::getNever();
1492 
1493   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
1494         << "...\n");
1495 
1496   CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
1497   bool ShouldInline = CA.analyzeCall(CS);
1498 
1499   DEBUG(CA.dump());
1500 
1501   // Check if there was a reason to force inlining or no inlining.
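       // When the boolean decision disagrees with a plain cost-vs-threshold
       // comparison, the analysis forced the outcome (for example by bailing out
       // early); surface it as never/always rather than as a numeric cost.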
1502   if (!ShouldInline && CA.getCost() < CA.getThreshold())
1503     return InlineCost::getNever();
1504   if (ShouldInline && CA.getCost() >= CA.getThreshold())
1505     return InlineCost::getAlways();
1506 
1507   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
1508 }
1509 
1510 bool llvm::isInlineViable(Function &F) {
1511   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
1512   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
1513     // Disallow inlining of functions which contain indirect branches or
1514     // blockaddresses.
1515     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
1516       return false;
1517 
1518     for (auto &II : *BI) {
1519       CallSite CS(&II);
1520       if (!CS)
1521         continue;
1522 
1523       // Disallow recursive calls.
1524       if (&F == CS.getCalledFunction())
1525         return false;
1526 
1527       // Disallow calls which expose returns-twice to a function not previously
1528       // attributed as such.
1529       if (!ReturnsTwice && CS.isCall() &&
1530           cast<CallInst>(CS.getInstruction())->canReturnTwice())
1531         return false;
1532 
1533       // Disallow inlining functions that call @llvm.localescape. Doing this
1534       // correctly would require major changes to the inliner.
1535       if (CS.getCalledFunction() &&
1536           CS.getCalledFunction()->getIntrinsicID() ==
1537               llvm::Intrinsic::localescape)
1538         return false;
1539     }
1540   }
1541 
1542   return true;
1543 }
1544