1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements inline cost analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Analysis/InlineCost.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/CFG.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/InstructionSimplify.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/Config/llvm-config.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/Dominators.h"
33 #include "llvm/IR/GetElementPtrTypeIterator.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/InstVisitor.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Operator.h"
38 #include "llvm/IR/PatternMatch.h"
39 #include "llvm/Support/CommandLine.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/raw_ostream.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "inline-cost"
46 
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Command-line knobs for the inline cost model. All thresholds are measured in
// the same abstract cost units as InlineConstants::InstrCost.

// Fallback threshold used when no more specific threshold (hint, hotness,
// coldness) applies.
static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::ZeroOrMore,
                     cl::desc("Default amount of inlining to perform"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

// Raised threshold for callees carrying the 'inlinehint' attribute.
static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

// Lowered threshold for call sites deemed cold (by profile or by relative
// block frequency; see ColdCallSiteRelFreq below).
static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

// Raised threshold for call sites that the profile marks as hot.
static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites "));

// Threshold for call sites that are hot relative to their caller's entry
// frequency, but not globally hot per the profile summary.
static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

// Useful for diagnostics: normally analysis stops as soon as the cost exceeds
// the threshold (see InlineCostCallAnalyzer::shouldStop).
static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));
99 
100 namespace {
101 class InlineCostCallAnalyzer;
/// Base class walking a callee's body (specialized to a particular call site)
/// via InstVisitor, simulating the simplifications inlining would enable.
/// Cost/benefit accounting is delegated to virtual "on*" extension points so
/// derived classes (e.g. InlineCostCallAnalyzer) can implement a policy.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  // Shorthand for the CRTP base, used to delegate default visit behavior.
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  // Protected virtual dtor: instances are destroyed through derived classes.
  virtual ~CallAnalyzer() {}
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline)
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated).  It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  /// If \p V maps to an SROA-candidate alloca: either credit a use
  /// (when \p DoNotDisable) and return true, or disable SROA for it.
  /// Returns false when no SROA bookkeeping applied.
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  // Properties observed while walking the callee body. They are set by the
  // visit methods (defined later in this file) and consumed by derived-class
  // policy code; all start out false/zero for a fresh analysis.
  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;

  /// Return the SROA-candidate alloca \p V maps to, or null if \p V is
  /// unmapped or SROA has since been disabled for that alloca.
  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.  Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  /// Construct an analyzer for inlining \p Callee at call site \p Call.
  /// Load elimination modeling starts enabled and may be switched off during
  /// the walk (see disableLoadElimination).
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallBase &Call)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), EnableLoadElimination(true) {}

  // Drive the analysis of the callee body; defined later in this file.
  InlineResult analyze();

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  // Debugging helper; definition not in this chunk.
  void dump();
};
362 
/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
///
/// Concrete cost policy: accumulates an integer Cost and compares it against a
/// Threshold that is adjusted by call-site properties and speculative bonuses.
class InlineCostCallAnalyzer final : public CallAnalyzer {
  // Cap for Cost, chosen so one more InstrCost addition cannot overflow 'int'.
  const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
  // When true, keep analyzing past the threshold (set from the -inline-cost-full
  // flag, from Params, or whenever remarks are enabled -- see the constructor).
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// Attempt to evaluate indirect calls to boost its inline cost.
  const bool BoostIndirectCalls;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // True while every analyzed block had at most one successor; cleared (and
  // SingleBBBonus retracted) in onBlockAnalyzed once a multi-successor
  // terminator is seen.
  bool SingleBB = true;

  // Running total of cost savings credited to SROA, and the portion forfeited
  // when SROA had to be disabled for an alloca.
  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  // Refund the SROA savings accumulated for \p Arg back into Cost when SROA
  // is disabled; no-op if Arg was never registered.
  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  // Refund accumulated load-elimination savings back into Cost.
  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }
  void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InlineConstants::InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
                                Call, IndirectCallParams, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(InlineConstants::CallPenalty);
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                       4 * InlineConstants::InstrCost;

      addCost(JTCost, (int64_t)CostUpperBound);
      return;
    }
    // Considering forming a binary search, we should find the number of nodes
    // which is same as the number of comparisons when lowered. For a given
    // number of clusters, n, we can define a recursive function, f(n), to find
    // the number of nodes in the tree. The recursion is :
    // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
    // and f(n) = n, when n <= 3.
    // This will lead a binary tree where the leaf should be either f(2) or f(3)
    // when n > 3.  So, the number of comparisons from leaves should be n, while
    // the number of non-leaf should be :
    //   2^(log2(n) - 1) - 1
    //   = 2^log2(n) * 2^-1 - 1
    //   = n / 2 - 1.
    // Considering comparisons from leaf and non-leaf nodes, we can estimate the
    // number of comparisons in a simple closed form :
    //   n + n / 2 - 1 = n * 3 / 2 - 1
    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
    int64_t SwitchCost =
        ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

    addCost(SwitchCost, (int64_t)CostUpperBound);
  }
  void onMissedSimplification() override {
    addCost(InlineConstants::InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    SROAArgCosts[Arg] = 0;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InlineConstants::InstrCost;
    SROACostSavings += InlineConstants::InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    auto *TI = BB->getTerminator();
    // If we had any successors at this point, than post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::CallPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (Cost < std::max(1, Threshold))
      return InlineResult::success();
    return InlineResult::failure("Cost over threshold.");
  }
  bool shouldStop() override {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    return Cost >= Threshold && !ComputeFullInlineCost;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InlineConstants::InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are some what arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  /// \p BoostIndirect controls whether indirect calls with an inferred target
  /// are speculatively analyzed for a cost bonus (see onLoweredCall).
  InlineCostCallAnalyzer(
      const TargetTransformInfo &TTI,
      std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
      : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect) {}
  // Debugging helper; definition not in this chunk.
  void dump();

  virtual ~InlineCostCallAnalyzer() {}
  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }
};
646 } // namespace
647 
648 /// Test whether the given value is an Alloca-derived function argument.
649 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
650   return SROAArgValues.count(V);
651 }
652 
// Disable SROA for the given caller alloca: notify the policy first (so any
// accumulated savings can be refunded before the bookkeeping is dropped),
// then remove it from the enabled set and give up on load elimination.
void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}
658 /// If 'V' maps to a SROA candidate, disable SROA for it.
659 void CallAnalyzer::disableSROA(Value *V) {
660   if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
661     disableSROAForArg(SROAArg);
662   }
663 }
664 
665 void CallAnalyzer::disableLoadElimination() {
666   if (EnableLoadElimination) {
667     onDisableLoadElimination();
668     EnableLoadElimination = false;
669   }
670 }
671 
672 /// Accumulate a constant GEP offset into an APInt if possible.
673 ///
674 /// Returns false if unable to compute the offset for any reason. Respects any
675 /// simplified values known during the analysis of this callsite.
676 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
677   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
678   assert(IntPtrWidth == Offset.getBitWidth());
679 
680   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
681        GTI != GTE; ++GTI) {
682     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
683     if (!OpC)
684       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
685         OpC = dyn_cast<ConstantInt>(SimpleOp);
686     if (!OpC)
687       return false;
688     if (OpC->isZero())
689       continue;
690 
691     // Handle a struct index, which adds its field offset to the pointer.
692     if (StructType *STy = GTI.getStructTypeOrNull()) {
693       unsigned ElementIdx = OpC->getZExtValue();
694       const StructLayout *SL = DL.getStructLayout(STy);
695       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
696       continue;
697     }
698 
699     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
700     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
701   }
702   return true;
703 }
704 
705 /// Use TTI to check whether a GEP is free.
706 ///
707 /// Respects any simplified values known during the analysis of this callsite.
708 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
709   SmallVector<Value *, 4> Operands;
710   Operands.push_back(GEP.getOperand(0));
711   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
712     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
713       Operands.push_back(SimpleOp);
714     else
715       Operands.push_back(*I);
716   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
717 }
718 
719 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
720   // Check whether inlining will turn a dynamic alloca into a static
721   // alloca and handle that case.
722   if (I.isArrayAllocation()) {
723     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
724     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
725       Type *Ty = I.getAllocatedType();
726       AllocatedSize = SaturatingMultiplyAdd(
727           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
728           AllocatedSize);
729       return Base::visitAlloca(I);
730     }
731   }
732 
733   // Accumulate the allocated size.
734   if (I.isStaticAlloca()) {
735     Type *Ty = I.getAllocatedType();
736     AllocatedSize =
737         SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(), AllocatedSize);
738   }
739 
740   // We will happily inline static alloca instructions.
741   if (I.isStaticAlloca())
742     return Base::visitAlloca(I);
743 
744   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
745   // a variety of reasons, and so we would like to not inline them into
746   // functions which don't currently have a dynamic alloca. This simply
747   // disables inlining altogether in the presence of a dynamic alloca.
748   HasDynamicAlloca = true;
749   return false;
750 }
751 
/// Analyze a phi node, trying to fold it either to a single constant (stored
/// in SimplifiedValues) or to a single pointer with constant offset (stored
/// in ConstantOffsetPtrs, keeping SROA alive for the underlying argument).
/// Always returns true: phi nodes themselves are modeled as zero cost.
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate it's bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  // Exactly one of FirstC / FirstV is set once a candidate has been found.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    // Look through values we have already simplified to constants.
    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    // Keep SROA enabled for the phi if its single source was SROA-able.
    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
845 
846 /// Check we can fold GEPs of constant-offset call site argument pointers.
847 /// This requires target data and inbounds GEPs.
848 ///
849 /// \return true if the specified GEP can be folded.
850 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
851   // Check if we have a base + offset for the pointer.
852   std::pair<Value *, APInt> BaseAndOffset =
853       ConstantOffsetPtrs.lookup(I.getPointerOperand());
854   if (!BaseAndOffset.first)
855     return false;
856 
857   // Check if the offset of this GEP is constant, and if so accumulate it
858   // into Offset.
859   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
860     return false;
861 
862   // Add the result as a new mapping to Base + Offset.
863   ConstantOffsetPtrs[&I] = BaseAndOffset;
864 
865   return true;
866 }
867 
868 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
869   auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
870 
871   // Lambda to check whether a GEP's indices are all constant.
872   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
873     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
874       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
875         return false;
876     return true;
877   };
878 
879   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
880     if (SROAArg)
881       SROAArgValues[&I] = SROAArg;
882 
883     // Constant GEPs are modeled as free.
884     return true;
885   }
886 
887   // Variable GEPs will require math and will disable SROA.
888   if (SROAArg)
889     disableSROAForArg(SROAArg);
890   return isGEPFree(I);
891 }
892 
893 /// Simplify \p I if its operands are constants and update SimplifiedValues.
894 /// \p Evaluate is a callable specific to instruction type that evaluates the
895 /// instruction when all the operands are constants.
896 template <typename Callable>
897 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
898   SmallVector<Constant *, 2> COps;
899   for (Value *Op : I.operands()) {
900     Constant *COp = dyn_cast<Constant>(Op);
901     if (!COp)
902       COp = SimplifiedValues.lookup(Op);
903     if (!COp)
904       return false;
905     COps.push_back(COp);
906   }
907   auto *C = Evaluate(COps);
908   if (!C)
909     return false;
910   SimplifiedValues[&I] = C;
911   return true;
912 }
913 
914 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
915   // Propagate constants through bitcasts.
916   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
917         return ConstantExpr::getBitCast(COps[0], I.getType());
918       }))
919     return true;
920 
921   // Track base/offsets through casts
922   std::pair<Value *, APInt> BaseAndOffset =
923       ConstantOffsetPtrs.lookup(I.getOperand(0));
924   // Casts don't change the offset, just wrap it up.
925   if (BaseAndOffset.first)
926     ConstantOffsetPtrs[&I] = BaseAndOffset;
927 
928   // Also look for SROA candidates here.
929   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
930     SROAArgValues[&I] = SROAArg;
931 
932   // Bitcasts are always zero cost.
933   return true;
934 }
935 
936 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
937   // Propagate constants through ptrtoint.
938   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
939         return ConstantExpr::getPtrToInt(COps[0], I.getType());
940       }))
941     return true;
942 
943   // Track base/offset pairs when converted to a plain integer provided the
944   // integer is large enough to represent the pointer.
945   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
946   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
947   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
948     std::pair<Value *, APInt> BaseAndOffset =
949         ConstantOffsetPtrs.lookup(I.getOperand(0));
950     if (BaseAndOffset.first)
951       ConstantOffsetPtrs[&I] = BaseAndOffset;
952   }
953 
954   // This is really weird. Technically, ptrtoint will disable SROA. However,
955   // unless that ptrtoint is *used* somewhere in the live basic blocks after
956   // inlining, it will be nuked, and SROA should proceed. All of the uses which
957   // would block SROA would also block SROA if applied directly to a pointer,
958   // and so we can just add the integer in here. The only places where SROA is
959   // preserved either cannot fire on an integer, or won't in-and-of themselves
960   // disable SROA (ext) w/o some later use that we would see and disable.
961   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
962     SROAArgValues[&I] = SROAArg;
963 
964   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
965 }
966 
967 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
968   // Propagate constants through ptrtoint.
969   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
970         return ConstantExpr::getIntToPtr(COps[0], I.getType());
971       }))
972     return true;
973 
974   // Track base/offset pairs when round-tripped through a pointer without
975   // modifications provided the integer is not too large.
976   Value *Op = I.getOperand(0);
977   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
978   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
979     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
980     if (BaseAndOffset.first)
981       ConstantOffsetPtrs[&I] = BaseAndOffset;
982   }
983 
984   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
985   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
986     SROAArgValues[&I] = SROAArg;
987 
988   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
989 }
990 
991 bool CallAnalyzer::visitCastInst(CastInst &I) {
992   // Propagate constants through casts.
993   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
994         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
995       }))
996     return true;
997 
998   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
999   disableSROA(I.getOperand(0));
1000 
1001   // If this is a floating-point cast, and the target says this operation
1002   // is expensive, this may eventually become a library call. Treat the cost
1003   // as such.
1004   switch (I.getOpcode()) {
1005   case Instruction::FPTrunc:
1006   case Instruction::FPExt:
1007   case Instruction::UIToFP:
1008   case Instruction::SIToFP:
1009   case Instruction::FPToUI:
1010   case Instruction::FPToSI:
1011     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1012       onCallPenalty();
1013     break;
1014   default:
1015     break;
1016   }
1017 
1018   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1019 }
1020 
1021 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
1022   Value *Operand = I.getOperand(0);
1023   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1024         return ConstantFoldInstOperands(&I, COps[0], DL);
1025       }))
1026     return true;
1027 
1028   // Disable any SROA on the argument to arbitrary unary instructions.
1029   disableSROA(Operand);
1030 
1031   return false;
1032 }
1033 
/// Check whether argument \p A carries attribute \p Attr at this call site
/// (queried on CandidateCall, not on the callee's declaration).
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}
1037 
1038 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1039   // Does the *call site* have the NonNull attribute set on an argument?  We
1040   // use the attribute on the call site to memoize any analysis done in the
1041   // caller. This will also trip if the callee function has a non-null
1042   // parameter attribute, but that's a less interesting case because hopefully
1043   // the callee would already have been simplified based on that.
1044   if (Argument *A = dyn_cast<Argument>(V))
1045     if (paramHasAttr(A, Attribute::NonNull))
1046       return true;
1047 
1048   // Is this an alloca in the caller?  This is distinct from the attribute case
1049   // above because attributes aren't updated within the inliner itself and we
1050   // always want to catch the alloca derived case.
1051   if (isAllocaDerivedArg(V))
1052     // We can actually predict the result of comparisons between an
1053     // alloca-derived value and null. Note that this fires regardless of
1054     // SROA firing.
1055     return true;
1056 
1057   return false;
1058 }
1059 
1060 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1061   // If the normal destination of the invoke or the parent block of the call
1062   // site is unreachable-terminated, there is little point in inlining this
1063   // unless there is literally zero cost.
1064   // FIXME: Note that it is possible that an unreachable-terminated block has a
1065   // hot entry. For example, in below scenario inlining hot_call_X() may be
1066   // beneficial :
1067   // main() {
1068   //   hot_call_1();
1069   //   ...
1070   //   hot_call_N()
1071   //   exit(0);
1072   // }
1073   // For now, we are not handling this corner case here as it is rare in real
1074   // code. In future, we should elaborate this based on BPI and BFI in more
1075   // general threshold adjusting heuristics in updateThreshold().
1076   if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1077     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1078       return false;
1079   } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1080     return false;
1081 
1082   return true;
1083 }
1084 
1085 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1086                                             BlockFrequencyInfo *CallerBFI) {
1087   // If global profile summary is available, then callsite's coldness is
1088   // determined based on that.
1089   if (PSI && PSI->hasProfileSummary())
1090     return PSI->isColdCallSite(CallSite(&Call), CallerBFI);
1091 
1092   // Otherwise we need BFI to be available.
1093   if (!CallerBFI)
1094     return false;
1095 
1096   // Determine if the callsite is cold relative to caller's entry. We could
1097   // potentially cache the computation of scaled entry frequency, but the added
1098   // complexity is not worth it unless this scaling shows up high in the
1099   // profiles.
1100   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1101   auto CallSiteBB = Call.getParent();
1102   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1103   auto CallerEntryFreq =
1104       CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1105   return CallSiteFreq < CallerEntryFreq * ColdProb;
1106 }
1107 
1108 Optional<int>
1109 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1110                                                 BlockFrequencyInfo *CallerBFI) {
1111 
1112   // If global profile summary is available, then callsite's hotness is
1113   // determined based on that.
1114   if (PSI && PSI->hasProfileSummary() &&
1115       PSI->isHotCallSite(CallSite(&Call), CallerBFI))
1116     return Params.HotCallSiteThreshold;
1117 
1118   // Otherwise we need BFI to be available and to have a locally hot callsite
1119   // threshold.
1120   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1121     return None;
1122 
1123   // Determine if the callsite is hot relative to caller's entry. We could
1124   // potentially cache the computation of scaled entry frequency, but the added
1125   // complexity is not worth it unless this scaling shows up high in the
1126   // profiles.
1127   auto CallSiteBB = Call.getParent();
1128   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
1129   auto CallerEntryFreq = CallerBFI->getEntryFreq();
1130   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
1131     return Params.LocallyHotCallSiteThreshold;
1132 
1133   // Otherwise treat it normally.
1134   return None;
1135 }
1136 
/// Compute the inlining Threshold for \p Call into \p Callee, taking size
/// attributes, callsite/callee hotness and coldness, and target hooks into
/// account. Also derives SingleBBBonus/VectorBonus, and may reduce Cost for
/// the last call to a local-linkage function. The order of adjustments below
/// is deliberate; see the FIXMEs for known interactions.
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  // Bonuses are computed from the final Threshold so they scale with it.
  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}
1258 
1259 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1260   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1261   // First try to handle simplified comparisons.
1262   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1263         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
1264       }))
1265     return true;
1266 
1267   if (I.getOpcode() == Instruction::FCmp)
1268     return false;
1269 
1270   // Otherwise look for a comparison between constant offset pointers with
1271   // a common base.
1272   Value *LHSBase, *RHSBase;
1273   APInt LHSOffset, RHSOffset;
1274   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1275   if (LHSBase) {
1276     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1277     if (RHSBase && LHSBase == RHSBase) {
1278       // We have common bases, fold the icmp to a constant based on the
1279       // offsets.
1280       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1281       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1282       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1283         SimplifiedValues[&I] = C;
1284         ++NumConstantPtrCmps;
1285         return true;
1286       }
1287     }
1288   }
1289 
1290   // If the comparison is an equality comparison with null, we can simplify it
1291   // if we know the value (argument) can't be null
1292   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1293       isKnownNonNullInCallee(I.getOperand(0))) {
1294     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1295     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1296                                       : ConstantInt::getFalse(I.getType());
1297     return true;
1298   }
1299   return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
1300 }
1301 
1302 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1303   // Try to handle a special case: we can fold computing the difference of two
1304   // constant-related pointers.
1305   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1306   Value *LHSBase, *RHSBase;
1307   APInt LHSOffset, RHSOffset;
1308   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1309   if (LHSBase) {
1310     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1311     if (RHSBase && LHSBase == RHSBase) {
1312       // We have common bases, fold the subtract to a constant based on the
1313       // offsets.
1314       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1315       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1316       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1317         SimplifiedValues[&I] = C;
1318         ++NumConstantPtrDiffs;
1319         return true;
1320       }
1321     }
1322   }
1323 
1324   // Otherwise, fall back to the generic logic for simplifying and handling
1325   // instructions.
1326   return Base::visitSub(I);
1327 }
1328 
1329 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1330   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1331   Constant *CLHS = dyn_cast<Constant>(LHS);
1332   if (!CLHS)
1333     CLHS = SimplifiedValues.lookup(LHS);
1334   Constant *CRHS = dyn_cast<Constant>(RHS);
1335   if (!CRHS)
1336     CRHS = SimplifiedValues.lookup(RHS);
1337 
1338   Value *SimpleV = nullptr;
1339   if (auto FI = dyn_cast<FPMathOperator>(&I))
1340     SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
1341                             FI->getFastMathFlags(), DL);
1342   else
1343     SimpleV =
1344         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1345 
1346   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1347     SimplifiedValues[&I] = C;
1348 
1349   if (SimpleV)
1350     return true;
1351 
1352   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1353   disableSROA(LHS);
1354   disableSROA(RHS);
1355 
1356   // If the instruction is floating point, and the target says this operation
1357   // is expensive, this may eventually become a library call. Treat the cost
1358   // as such. Unless it's fneg which can be implemented with an xor.
1359   using namespace llvm::PatternMatch;
1360   if (I.getType()->isFloatingPointTy() &&
1361       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
1362       !match(&I, m_FNeg(m_Value())))
1363     onCallPenalty();
1364 
1365   return false;
1366 }
1367 
1368 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
1369   Value *Op = I.getOperand(0);
1370   Constant *COp = dyn_cast<Constant>(Op);
1371   if (!COp)
1372     COp = SimplifiedValues.lookup(Op);
1373 
1374   Value *SimpleV = SimplifyFNegInst(
1375       COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
1376 
1377   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1378     SimplifiedValues[&I] = C;
1379 
1380   if (SimpleV)
1381     return true;
1382 
1383   // Disable any SROA on arguments to arbitrary, unsimplified fneg.
1384   disableSROA(Op);
1385 
1386   return false;
1387 }
1388 
1389 bool CallAnalyzer::visitLoad(LoadInst &I) {
1390   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1391     return true;
1392 
1393   // If the data is already loaded from this address and hasn't been clobbered
1394   // by any stores or calls, this load is likely to be redundant and can be
1395   // eliminated.
1396   if (EnableLoadElimination &&
1397       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1398     onLoadEliminationOpportunity();
1399     return true;
1400   }
1401 
1402   return false;
1403 }
1404 
1405 bool CallAnalyzer::visitStore(StoreInst &I) {
1406   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1407     return true;
1408 
1409   // The store can potentially clobber loads and prevent repeated loads from
1410   // being eliminated.
1411   // FIXME:
1412   // 1. We can probably keep an initial set of eliminatable loads substracted
1413   // from the cost even when we finally see a store. We just need to disable
1414   // *further* accumulation of elimination savings.
1415   // 2. We should probably at some point thread MemorySSA for the callee into
1416   // this and then use that to actually compute *really* precise savings.
1417   disableLoadElimination();
1418   return false;
1419 }
1420 
1421 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1422   // Constant folding for extract value is trivial.
1423   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1424         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1425       }))
1426     return true;
1427 
1428   // SROA can look through these but give them a cost.
1429   return false;
1430 }
1431 
1432 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1433   // Constant folding for insert value is trivial.
1434   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1435         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1436                                             /*InsertedValueOperand*/ COps[1],
1437                                             I.getIndices());
1438       }))
1439     return true;
1440 
1441   // SROA can look through these but give them a cost.
1442   return false;
1443 }
1444 
1445 /// Try to simplify a call site.
1446 ///
1447 /// Takes a concrete function and callsite and tries to actually simplify it by
1448 /// analyzing the arguments and call itself with instsimplify. Returns true if
1449 /// it has simplified the callsite to some other entity (a constant), making it
1450 /// free.
1451 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
1452   // FIXME: Using the instsimplify logic directly for this is inefficient
1453   // because we have to continually rebuild the argument list even when no
1454   // simplifications can be performed. Until that is fixed with remapping
1455   // inside of instsimplify, directly constant fold calls here.
1456   if (!canConstantFoldCallTo(&Call, F))
1457     return false;
1458 
1459   // Try to re-map the arguments to constants.
1460   SmallVector<Constant *, 4> ConstantArgs;
1461   ConstantArgs.reserve(Call.arg_size());
1462   for (Value *I : Call.args()) {
1463     Constant *C = dyn_cast<Constant>(I);
1464     if (!C)
1465       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
1466     if (!C)
1467       return false; // This argument doesn't map to a constant.
1468 
1469     ConstantArgs.push_back(C);
1470   }
1471   if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
1472     SimplifiedValues[&Call] = C;
1473     return true;
1474   }
1475 
1476   return false;
1477 }
1478 
// Model the cost of a call site. Tries to fold the call away entirely when
// the callee and all its arguments are known constants; otherwise dispatches
// on known intrinsics and records analysis-aborting patterns (returns-twice,
// recursion, uninlineable intrinsics, va_start). Returns true when the call
// is considered free.
bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  // NOTE: this local 'F' shadows the CallAnalyzer member 'F' (the callee
  // under analysis); from here on, 'F' refers to the function called by this
  // call site.
  Value *Callee = Call.getCalledOperand();
  Function *F = dyn_cast_or_null<Function>(Callee);
  bool IsIndirectCall = !F;
  if (IsIndirectCall) {
    // Check if this happens to be an indirect function call to a known function
    // in this inline context. If not, we've done all we can.
    F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
    if (!F) {
      onCallArgumentSetup(Call);

      if (!Call.onlyReadsMemory())
        disableLoadElimination();
      return Base::visitCallBase(Call);
    }
  }

  assert(F && "Expected a call to a known function");

  // When we have a concrete function, first try to simplify it directly.
  if (simplifyCallSite(F, Call))
    return true;

  // Next check if it is an intrinsic we know about.
  // FIXME: Lift this into part of the InstVisitor.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
    switch (II->getIntrinsicID()) {
    default:
      if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
        disableLoadElimination();
      return Base::visitCallBase(Call);

    case Intrinsic::load_relative:
      onLoadRelativeIntrinsic();
      return false;

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      disableLoadElimination();
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    case Intrinsic::icall_branch_funnel:
    case Intrinsic::localescape:
      HasUninlineableIntrinsic = true;
      return false;
    case Intrinsic::vastart:
      InitsVargArgs = true;
      return false;
    }
  }

  if (F == Call.getFunction()) {
    // This flag will fully abort the analysis, so don't bother with anything
    // else.
    IsRecursiveCall = true;
    return false;
  }

  if (TTI.isLoweredToCall(F)) {
    // Charge for an actual call instruction in the lowered code.
    onLoweredCall(F, Call, IsIndirectCall);
  }

  if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
    disableLoadElimination();
  return Base::visitCallBase(Call);
}
1555 
1556 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1557   // At least one return instruction will be free after inlining.
1558   bool Free = !HasReturn;
1559   HasReturn = true;
1560   return Free;
1561 }
1562 
1563 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1564   // We model unconditional branches as essentially free -- they really
1565   // shouldn't exist at all, but handling them makes the behavior of the
1566   // inliner more regular and predictable. Interestingly, conditional branches
1567   // which will fold away are also free.
1568   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1569          dyn_cast_or_null<ConstantInt>(
1570              SimplifiedValues.lookup(BI.getCondition()));
1571 }
1572 
// Model a select instruction. A select is free when it can be simplified to
// one of its operands (identical arms, or a constant condition), or folded
// entirely when all operands are constants. For pointer-typed selects we also
// propagate constant-offset and SROA information from the selected operand.
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  // Resolve each arm and the condition to a constant where possible, using
  // the values already simplified during this analysis.
  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    // Both arms resolving to the same base pointer and offset lets us treat
    // the select itself as that pointer for SROA/offset purposes.
    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s.  If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  // The select is free; forward any pointer bookkeeping from the operand
  // that will be selected.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}
1649 
1650 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1651   // We model unconditional switches as free, see the comments on handling
1652   // branches.
1653   if (isa<ConstantInt>(SI.getCondition()))
1654     return true;
1655   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1656     if (isa<ConstantInt>(V))
1657       return true;
1658 
1659   // Assume the most general case where the switch is lowered into
1660   // either a jump table, bit test, or a balanced binary tree consisting of
1661   // case clusters without merging adjacent clusters with the same
1662   // destination. We do not consider the switches that are lowered with a mix
1663   // of jump table/bit test/binary search tree. The cost of the switch is
1664   // proportional to the size of the tree or the size of jump table range.
1665   //
1666   // NB: We convert large switches which are just used to initialize large phi
1667   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1668   // inlining those. It will prevent inlining in cases where the optimization
1669   // does not (yet) fire.
1670 
1671   unsigned JumpTableSize = 0;
1672   BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
1673   unsigned NumCaseCluster =
1674       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
1675 
1676   onFinalizeSwitch(JumpTableSize, NumCaseCluster);
1677   return false;
1678 }
1679 
// An indirectbr makes the whole callee uninlinable; record that and charge
// for the instruction (return false).
bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr.  This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}
1692 
// Charge a resume instruction at the default single-instruction cost.
bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}
1698 
// Charge a cleanupret instruction at the default single-instruction cost.
bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}
1704 
// Charge a catchret instruction at the default single-instruction cost.
bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}
1710 
// Unreachable terminators generate no code, so they are modeled as free.
bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}
1717 
1718 bool CallAnalyzer::visitInstruction(Instruction &I) {
1719   // Some instructions are free. All of the free intrinsics can also be
1720   // handled by SROA, etc.
1721   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1722     return true;
1723 
1724   // We found something we don't understand or can't handle. Mark any SROA-able
1725   // values in the operand list as no longer viable.
1726   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1727     disableSROA(*OI);
1728 
1729   return false;
1730 }
1731 
/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    // Each flag below is set by the corresponding visit* handler.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      // Explain in the optimization-remark stream why inlining was abandoned.
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      auto IR =
          InlineResult::failure("recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    // The cost model (subclass) may decide the accumulated cost already rules
    // out inlining; stop scanning the block early in that case.
    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}
1823 
/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns nullptr if V is not a pointer, and returns the constant '0' if
/// there are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  // Accumulate offsets at the index width of V's address space.
  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    // Peel one layer per iteration: an inbounds GEP with constant offsets,
    // a bitcast, or a non-interposable global alias. Anything else (or a
    // non-inbounds / non-constant GEP) ends the walk.
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IdxPtrTy = DL.getIndexType(V->getType());
  return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
}
1862 
/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges.  If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  // Seed a worklist with each newly-dead successor of CurrBB (other than the
  // known-live NextBB) and transitively propagate deadness forward.
  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      // DeadBlocks.insert returns whether the block was newly inserted,
      // ensuring each block's successors are examined only once.
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block list.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
1900 
/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  // A declaration (no body) has nothing to analyze.
  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  // CAI walks the actual call arguments in lockstep with the formal
  // parameters FAI.
  auto CAI = CandidateCall.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches.  But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    onBlockAnalyzed(BB);
  }

  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                    &F == CandidateCall.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  return finalizeAnalysis();
}
2047 
2048 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
/// Only compiled into debug or LLVM_ENABLE_DUMP builds (see enclosing #if).
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
// Prints each statistic as "      <name>: <value>".
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
2067 #endif
2068 
2069 /// Test that there are no attribute conflicts between Caller and Callee
2070 ///        that prevent inlining.
2071 static bool functionsHaveCompatibleAttributes(Function *Caller,
2072                                               Function *Callee,
2073                                               TargetTransformInfo &TTI) {
2074   return TTI.areInlineCompatible(Caller, Callee) &&
2075          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2076 }
2077 
2078 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
2079   int Cost = 0;
2080   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2081     if (Call.isByValArgument(I)) {
2082       // We approximate the number of loads and stores needed by dividing the
2083       // size of the byval type by the target's pointer size.
2084       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2085       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
2086       unsigned AS = PTy->getAddressSpace();
2087       unsigned PointerSize = DL.getPointerSizeInBits(AS);
2088       // Ceiling division.
2089       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2090 
2091       // If it generates more than 8 stores it is likely to be expanded as an
2092       // inline memcpy so we take that as an upper bound. Otherwise we assume
2093       // one load and one store per word copied.
2094       // FIXME: The maxStoresPerMemcpy setting from the target should be used
2095       // here instead of a magic number of 8, but it's not available via
2096       // DataLayout.
2097       NumStores = std::min(NumStores, 8U);
2098 
2099       Cost += 2 * NumStores * InlineConstants::InstrCost;
2100     } else {
2101       // For non-byval arguments subtract off one instruction per call
2102       // argument.
2103       Cost += InlineConstants::InstrCost;
2104     }
2105   }
2106   // The call instruction also disappears after inlining.
2107   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
2108   return Cost;
2109 }
2110 
/// Convenience overload: compute the inline cost using the call site's
/// statically called function as the callee.
InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}
2119 
/// Compute the inline cost of \p Call with \p Callee as the target. A series
/// of legality and policy gates can force "never" (or "always") before the
/// full CallAnalyzer cost model runs.
InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever("indirect call");

  // Never inline calls with byval arguments that does not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever("byval arguments without alloca"
                                          " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return llvm::InlineCost::getNever("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return llvm::InlineCost::getNever("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return llvm::InlineCost::getNever("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return llvm::InlineCost::getNever("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return llvm::InlineCost::getNever("noinline call site attribute");

  LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                          << "... (caller:" << Caller->getName() << ")\n");

  // All gates passed: run the full cost model.
  InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
                            *Callee, Call, Params);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever(ShouldInline.getFailureReason());
  if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways("empty function");

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}
2198 
2199 InlineResult llvm::isInlineViable(Function &F) {
2200   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2201   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2202     // Disallow inlining of functions which contain indirect branches.
2203     if (isa<IndirectBrInst>(BI->getTerminator()))
2204       return InlineResult::failure("contains indirect branches");
2205 
2206     // Disallow inlining of blockaddresses which are used by non-callbr
2207     // instructions.
2208     if (BI->hasAddressTaken())
2209       for (User *U : BlockAddress::get(&*BI)->users())
2210         if (!isa<CallBrInst>(*U))
2211           return InlineResult::failure("blockaddress used outside of callbr");
2212 
2213     for (auto &II : *BI) {
2214       CallBase *Call = dyn_cast<CallBase>(&II);
2215       if (!Call)
2216         continue;
2217 
2218       // Disallow recursive calls.
2219       if (&F == Call->getCalledFunction())
2220         return InlineResult::failure("recursive call");
2221 
2222       // Disallow calls which expose returns-twice to a function not previously
2223       // attributed as such.
2224       if (!ReturnsTwice && isa<CallInst>(Call) &&
2225           cast<CallInst>(Call)->canReturnTwice())
2226         return InlineResult::failure("exposes returns-twice attribute");
2227 
2228       if (Call->getCalledFunction())
2229         switch (Call->getCalledFunction()->getIntrinsicID()) {
2230         default:
2231           break;
2232         case llvm::Intrinsic::icall_branch_funnel:
2233           // Disallow inlining of @llvm.icall.branch.funnel because current
2234           // backend can't separate call targets from call arguments.
2235           return InlineResult::failure(
2236               "disallowed inlining of @llvm.icall.branch.funnel");
2237         case llvm::Intrinsic::localescape:
2238           // Disallow inlining functions that call @llvm.localescape. Doing this
2239           // correctly would require major changes to the inliner.
2240           return InlineResult::failure(
2241               "disallowed inlining of @llvm.localescape");
2242         case llvm::Intrinsic::vastart:
2243           // Disallow inlining of functions that initialize VarArgs with
2244           // va_start.
2245           return InlineResult::failure(
2246               "contains VarArgs initialized with va_start");
2247         }
2248     }
2249   }
2250 
2251   return InlineResult::success();
2252 }
2253 
2254 // APIs to create InlineParams based on command line flags and/or other
2255 // parameters.
2256 
2257 InlineParams llvm::getInlineParams(int Threshold) {
2258   InlineParams Params;
2259 
2260   // This field is the threshold to use for a callee by default. This is
2261   // derived from one or more of:
2262   //  * optimization or size-optimization levels,
2263   //  * a value passed to createFunctionInliningPass function, or
2264   //  * the -inline-threshold flag.
2265   //  If the -inline-threshold flag is explicitly specified, that is used
2266   //  irrespective of anything else.
2267   if (InlineThreshold.getNumOccurrences() > 0)
2268     Params.DefaultThreshold = InlineThreshold;
2269   else
2270     Params.DefaultThreshold = Threshold;
2271 
2272   // Set the HintThreshold knob from the -inlinehint-threshold.
2273   Params.HintThreshold = HintThreshold;
2274 
2275   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2276   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2277 
2278   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2279   // populate LocallyHotCallSiteThreshold. Later, we populate
2280   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2281   // we know that optimization level is O3 (in the getInlineParams variant that
2282   // takes the opt and size levels).
2283   // FIXME: Remove this check (and make the assignment unconditional) after
2284   // addressing size regression issues at O2.
2285   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2286     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2287 
2288   // Set the ColdCallSiteThreshold knob from the
2289   // -inline-cold-callsite-threshold.
2290   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2291 
2292   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2293   // -inlinehint-threshold commandline option is not explicitly given. If that
2294   // option is present, then its value applies even for callees with size and
2295   // minsize attributes.
2296   // If the -inline-threshold is not specified, set the ColdThreshold from the
2297   // -inlinecold-threshold even if it is not explicitly passed. If
2298   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2299   // explicitly specified to set the ColdThreshold knob
2300   if (InlineThreshold.getNumOccurrences() == 0) {
2301     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2302     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2303     Params.ColdThreshold = ColdThreshold;
2304   } else if (ColdThreshold.getNumOccurrences() > 0) {
2305     Params.ColdThreshold = ColdThreshold;
2306   }
2307   return Params;
2308 }
2309 
2310 InlineParams llvm::getInlineParams() {
2311   return getInlineParams(DefaultThreshold);
2312 }
2313 
2314 // Compute the default threshold for inlining based on the opt level and the
2315 // size opt level.
2316 static int computeThresholdFromOptLevels(unsigned OptLevel,
2317                                          unsigned SizeOptLevel) {
2318   if (OptLevel > 2)
2319     return InlineConstants::OptAggressiveThreshold;
2320   if (SizeOptLevel == 1) // -Os
2321     return InlineConstants::OptSizeThreshold;
2322   if (SizeOptLevel == 2) // -Oz
2323     return InlineConstants::OptMinSizeThreshold;
2324   return DefaultThreshold;
2325 }
2326 
2327 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2328   auto Params =
2329       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2330   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2331   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2332   // when it is specified explicitly.
2333   if (OptLevel > 2)
2334     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2335   return Params;
2336 }
2337