1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements inline cost analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Analysis/InlineCost.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/CFG.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/InstructionSimplify.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Config/llvm-config.h"
31 #include "llvm/IR/AssemblyAnnotationWriter.h"
32 #include "llvm/IR/CallingConv.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/GetElementPtrTypeIterator.h"
36 #include "llvm/IR/GlobalAlias.h"
37 #include "llvm/IR/InstVisitor.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Operator.h"
40 #include "llvm/IR/PatternMatch.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/FormattedStream.h"
44 #include "llvm/Support/raw_ostream.h"
45 
46 using namespace llvm;
47 
48 #define DEBUG_TYPE "inline-cost"
49 
50 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
51 
52 static cl::opt<int>
53     DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
54                      cl::ZeroOrMore,
55                      cl::desc("Default amount of inlining to perform"));
56 
static cl::opt<bool> PrintDebugInstructionDeltas(
    "print-instruction-deltas", cl::Hidden, cl::init(false),
    cl::desc("Prints deltas of cost and threshold per instruction"));
60 
61 static cl::opt<int> InlineThreshold(
62     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
63     cl::desc("Control the amount of inlining to perform (default = 225)"));
64 
65 static cl::opt<int> HintThreshold(
66     "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
67     cl::desc("Threshold for inlining functions with inline hint"));
68 
69 static cl::opt<int>
70     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
71                           cl::init(45), cl::ZeroOrMore,
72                           cl::desc("Threshold for inlining cold callsites"));
73 
// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
77 static cl::opt<int> ColdThreshold(
78     "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
79     cl::desc("Threshold for inlining functions with cold attribute"));
80 
81 static cl::opt<int>
82     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
83                          cl::ZeroOrMore,
84                          cl::desc("Threshold for hot callsites "));
85 
86 static cl::opt<int> LocallyHotCallSiteThreshold(
87     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
88     cl::desc("Threshold for locally hot callsites "));
89 
90 static cl::opt<int> ColdCallSiteRelFreq(
91     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
92     cl::desc("Maximum block frequency, expressed as a percentage of caller's "
93              "entry frequency, for a callsite to be cold in the absence of "
94              "profile information."));
95 
96 static cl::opt<int> HotCallSiteRelFreq(
97     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
98     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
99              "entry frequency, for a callsite to be hot in the absence of "
100              "profile information."));
101 
102 static cl::opt<bool> OptComputeFullInlineCost(
103     "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
104     cl::desc("Compute the full inline cost of a call site even when the cost "
105              "exceeds the threshold."));
106 
107 static cl::opt<bool> InlineCallerSupersetNoBuiltin(
108     "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
109     cl::ZeroOrMore,
110     cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
111              "attributes."));
112 
113 namespace {
114 class InlineCostCallAnalyzer;
115 
// This struct is used to store information about the inline cost of a
// particular instruction.
118 struct InstructionCostDetail {
119   int CostBefore;
120   int CostAfter;
121   int ThresholdBefore;
122   int ThresholdAfter;
123 };
124 
125 class CostAnnotationWriter : public AssemblyAnnotationWriter {
126 public:
127   // This DenseMap stores the delta change in cost and threshold after
128   // accounting for the given instruction.
129   DenseMap <const Instruction *, InstructionCostDetail> CostThresholdMap;
130 
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
133 };
134 
135 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
136   typedef InstVisitor<CallAnalyzer, bool> Base;
137   friend class InstVisitor<CallAnalyzer, bool>;
138 
139 protected:
140   virtual ~CallAnalyzer() {}
141   /// The TargetTransformInfo available for this compilation.
142   const TargetTransformInfo &TTI;
143 
144   /// Getter for the cache of @llvm.assume intrinsics.
145   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
146 
147   /// Getter for BlockFrequencyInfo
148   Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
149 
150   /// Profile summary information.
151   ProfileSummaryInfo *PSI;
152 
153   /// The called function.
154   Function &F;
155 
156   // Cache the DataLayout since we use it a lot.
157   const DataLayout &DL;
158 
159   /// The OptimizationRemarkEmitter available for this compilation.
160   OptimizationRemarkEmitter *ORE;
161 
162   /// The candidate callsite being analyzed. Please do not use this to do
163   /// analysis in the caller function; we want the inline cost query to be
164   /// easily cacheable. Instead, use the cover function paramHasAttr.
165   CallBase &CandidateCall;
166 
167   /// Extension points for handling callsite features.
168   /// Called after a basic block was analyzed.
169   virtual void onBlockAnalyzed(const BasicBlock *BB) {}
170 
  /// Called before an instruction is analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction has been analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}
176 
  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult::success()' if inlining may happen, or
  /// a failure result explaining why it can't.
180   virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
  /// Called when we're about to start processing a basic block, and every
  /// time we are done processing an instruction. Return true if there is no
  /// point in continuing the analysis (e.g. we've already determined the call
  /// site is too expensive to inline).
185   virtual bool shouldStop() { return false; }
186 
  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// failure result explaining why the analysis can't continue, or a success
  /// result if it may.
191   virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
192   /// Called if the analysis engine decides SROA cannot be done for the given
193   /// alloca.
194   virtual void onDisableSROA(AllocaInst *Arg) {}
195 
  /// Called when the analysis engine determines load elimination won't
  /// happen.
197   virtual void onDisableLoadElimination() {}
198 
199   /// Called to account for a call.
200   virtual void onCallPenalty() {}
201 
202   /// Called to account for the expectation the inlining would result in a load
203   /// elimination.
204   virtual void onLoadEliminationOpportunity() {}
205 
206   /// Called to account for the cost of argument setup for the Call in the
207   /// callee's body (not the callsite currently under analysis).
208   virtual void onCallArgumentSetup(const CallBase &Call) {}
209 
210   /// Called to account for a load relative intrinsic.
211   virtual void onLoadRelativeIntrinsic() {}
212 
213   /// Called to account for a lowered call.
214   virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
215   }
216 
  /// Account for a jump table of given size. Return false to stop further
  /// processing of the switch instruction.
219   virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
220 
221   /// Account for a case cluster of given size. Return false to stop further
222   /// processing of the instruction.
223   virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
224 
225   /// Called at the end of processing a switch instruction, with the given
226   /// number of case clusters.
227   virtual void onFinalizeSwitch(unsigned JumpTableSize,
228                                 unsigned NumCaseCluster) {}
229 
230   /// Called to account for any other instruction not specifically accounted
231   /// for.
232   virtual void onMissedSimplification() {}
233 
234   /// Start accounting potential benefits due to SROA for the given alloca.
235   virtual void onInitializeSROAArg(AllocaInst *Arg) {}
236 
237   /// Account SROA savings for the AllocaInst value.
238   virtual void onAggregateSROAUse(AllocaInst *V) {}
239 
240   bool handleSROA(Value *V, bool DoNotDisable) {
241     // Check for SROA candidates in comparisons.
242     if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
243       if (DoNotDisable) {
244         onAggregateSROAUse(SROAArg);
245         return true;
246       }
247       disableSROAForArg(SROAArg);
248     }
249     return false;
250   }
251 
252   bool IsCallerRecursive = false;
253   bool IsRecursiveCall = false;
254   bool ExposesReturnsTwice = false;
255   bool HasDynamicAlloca = false;
256   bool ContainsNoDuplicateCall = false;
257   bool HasReturn = false;
258   bool HasIndirectBr = false;
259   bool HasUninlineableIntrinsic = false;
260   bool InitsVargArgs = false;
261 
262   /// Number of bytes allocated statically by the callee.
263   uint64_t AllocatedSize = 0;
264   unsigned NumInstructions = 0;
265   unsigned NumVectorInstructions = 0;
266 
267   /// While we walk the potentially-inlined instructions, we build up and
268   /// maintain a mapping of simplified values specific to this callsite. The
269   /// idea is to propagate any special information we have about arguments to
270   /// this call through the inlinable section of the function, and account for
271   /// likely simplifications post-inlining. The most important aspect we track
272   /// is CFG altering simplifications -- when we prove a basic block dead, that
273   /// can cause dramatic shifts in the cost of inlining a function.
274   DenseMap<Value *, Constant *> SimplifiedValues;
275 
276   /// Keep track of the values which map back (through function arguments) to
277   /// allocas on the caller stack which could be simplified through SROA.
278   DenseMap<Value *, AllocaInst *> SROAArgValues;
279 
280   /// Keep track of Allocas for which we believe we may get SROA optimization.
281   DenseSet<AllocaInst *> EnabledSROAAllocas;
282 
283   /// Keep track of values which map to a pointer base and constant offset.
284   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
285 
286   /// Keep track of dead blocks due to the constant arguments.
287   SetVector<BasicBlock *> DeadBlocks;
288 
289   /// The mapping of the blocks to their known unique successors due to the
290   /// constant arguments.
291   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
292 
293   /// Model the elimination of repeated loads that is expected to happen
294   /// whenever we simplify away the stores that would otherwise cause them to be
295   /// loads.
296   bool EnableLoadElimination;
297   SmallPtrSet<Value *, 16> LoadAddrSet;
298 
299   AllocaInst *getSROAArgForValueOrNull(Value *V) const {
300     auto It = SROAArgValues.find(V);
301     if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
302       return nullptr;
303     return It->second;
304   }
305 
306   // Custom simplification helper routines.
307   bool isAllocaDerivedArg(Value *V);
308   void disableSROAForArg(AllocaInst *SROAArg);
309   void disableSROA(Value *V);
310   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
311   void disableLoadElimination();
312   bool isGEPFree(GetElementPtrInst &GEP);
313   bool canFoldInboundsGEP(GetElementPtrInst &I);
314   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
315   bool simplifyCallSite(Function *F, CallBase &Call);
316   template <typename Callable>
317   bool simplifyInstruction(Instruction &I, Callable Evaluate);
318   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
319 
320   /// Return true if the given argument to the function being considered for
321   /// inlining has the given attribute set either at the call site or the
322   /// function declaration.  Primarily used to inspect call site specific
323   /// attributes since these can be more precise than the ones on the callee
324   /// itself.
325   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
326 
327   /// Return true if the given value is known non null within the callee if
328   /// inlined through this particular callsite.
329   bool isKnownNonNullInCallee(Value *V);
330 
331   /// Return true if size growth is allowed when inlining the callee at \p Call.
332   bool allowSizeGrowth(CallBase &Call);
333 
334   // Custom analysis routines.
335   InlineResult analyzeBlock(BasicBlock *BB,
336                             SmallPtrSetImpl<const Value *> &EphValues);
337 
338   // Disable several entry points to the visitor so we don't accidentally use
339   // them by declaring but not defining them here.
340   void visit(Module *);
341   void visit(Module &);
342   void visit(Function *);
343   void visit(Function &);
344   void visit(BasicBlock *);
345   void visit(BasicBlock &);
346 
347   // Provide base case for our instruction visit.
348   bool visitInstruction(Instruction &I);
349 
350   // Our visit overrides.
351   bool visitAlloca(AllocaInst &I);
352   bool visitPHI(PHINode &I);
353   bool visitGetElementPtr(GetElementPtrInst &I);
354   bool visitBitCast(BitCastInst &I);
355   bool visitPtrToInt(PtrToIntInst &I);
356   bool visitIntToPtr(IntToPtrInst &I);
357   bool visitCastInst(CastInst &I);
358   bool visitUnaryInstruction(UnaryInstruction &I);
359   bool visitCmpInst(CmpInst &I);
360   bool visitSub(BinaryOperator &I);
361   bool visitBinaryOperator(BinaryOperator &I);
362   bool visitFNeg(UnaryOperator &I);
363   bool visitLoad(LoadInst &I);
364   bool visitStore(StoreInst &I);
365   bool visitExtractValue(ExtractValueInst &I);
366   bool visitInsertValue(InsertValueInst &I);
367   bool visitCallBase(CallBase &Call);
368   bool visitReturnInst(ReturnInst &RI);
369   bool visitBranchInst(BranchInst &BI);
370   bool visitSelectInst(SelectInst &SI);
371   bool visitSwitchInst(SwitchInst &SI);
372   bool visitIndirectBrInst(IndirectBrInst &IBI);
373   bool visitResumeInst(ResumeInst &RI);
374   bool visitCleanupReturnInst(CleanupReturnInst &RI);
375   bool visitCatchReturnInst(CatchReturnInst &RI);
376   bool visitUnreachableInst(UnreachableInst &I);
377 
378 public:
379   CallAnalyzer(const TargetTransformInfo &TTI,
380                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
381                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
382                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
383                Function &Callee, CallBase &Call)
384       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
385         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
386         CandidateCall(Call), EnableLoadElimination(true) {}
387 
388   InlineResult analyze();
389 
390   // Keep a bunch of stats about the cost savings found so we can print them
391   // out when debugging.
392   unsigned NumConstantArgs = 0;
393   unsigned NumConstantOffsetPtrArgs = 0;
394   unsigned NumAllocaArgs = 0;
395   unsigned NumConstantPtrCmps = 0;
396   unsigned NumConstantPtrDiffs = 0;
397   unsigned NumInstructionsSimplified = 0;
398 
399   void dump();
400 };
401 
/// FIXME: if it becomes necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, which instantiates an InlineCostCallAnalyzer
/// directly.
404 class InlineCostCallAnalyzer final : public CallAnalyzer {
405   const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
406   const bool ComputeFullInlineCost;
407   int LoadEliminationCost = 0;
408   /// Bonus to be applied when percentage of vector instructions in callee is
409   /// high (see more details in updateThreshold).
410   int VectorBonus = 0;
411   /// Bonus to be applied when the callee has only one reachable basic block.
412   int SingleBBBonus = 0;
413 
414   /// Tunable parameters that control the analysis.
415   const InlineParams &Params;
416 
  /// Upper bound for the inlining cost. Bonuses are applied to account for
  /// speculative "expected profit" of the inlining decision.
419   int Threshold = 0;
420 
  /// Attempt to evaluate indirect calls to boost their inline cost.
422   const bool BoostIndirectCalls;
423 
424   /// Inlining cost measured in abstract units, accounts for all the
425   /// instructions expected to be executed for a given function invocation.
426   /// Instructions that are statically proven to be dead based on call-site
427   /// arguments are not counted here.
428   int Cost = 0;
429 
430   bool SingleBB = true;
431 
432   unsigned SROACostSavings = 0;
433   unsigned SROACostSavingsLost = 0;
434 
435   /// The mapping of caller Alloca values to their accumulated cost savings. If
436   /// we have to disable SROA for one of the allocas, this tells us how much
437   /// cost must be added.
438   DenseMap<AllocaInst *, int> SROAArgCosts;
439 
440   /// Return true if \p Call is a cold callsite.
441   bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
442 
443   /// Update Threshold based on callsite properties such as callee
444   /// attributes and callee hotness for PGO builds. The Callee is explicitly
445   /// passed to support analyzing indirect calls whose target is inferred by
446   /// analysis.
447   void updateThreshold(CallBase &Call, Function &Callee);
448   /// Return a higher threshold if \p Call is a hot callsite.
449   Optional<int> getHotCallSiteThreshold(CallBase &Call,
450                                         BlockFrequencyInfo *CallerBFI);
451 
452   /// Handle a capped 'int' increment for Cost.
453   void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
454     assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
455     Cost = (int)std::min(UpperBound, Cost + Inc);
456   }
457 
458   void onDisableSROA(AllocaInst *Arg) override {
459     auto CostIt = SROAArgCosts.find(Arg);
460     if (CostIt == SROAArgCosts.end())
461       return;
462     addCost(CostIt->second);
463     SROACostSavings -= CostIt->second;
464     SROACostSavingsLost += CostIt->second;
465     SROAArgCosts.erase(CostIt);
466   }
467 
468   void onDisableLoadElimination() override {
469     addCost(LoadEliminationCost);
470     LoadEliminationCost = 0;
471   }
472   void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
473   void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. On average we account for one
    // instruction per call argument setup here.
476     addCost(Call.arg_size() * InlineConstants::InstrCost);
477   }
478   void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions; the instruction itself
    // is charged separately, so add the cost of the remaining 3.
480     addCost(3 * InlineConstants::InstrCost);
481   }
482   void onLoweredCall(Function *F, CallBase &Call,
483                      bool IsIndirectCall) override {
    // On average we account for one instruction per call argument setup here.
485     addCost(Call.arg_size() * InlineConstants::InstrCost);
486 
487     // If we have a constant that we are calling as a function, we can peer
488     // through it and see the function target. This happens not infrequently
489     // during devirtualization and so we want to give it a hefty bonus for
490     // inlining, but cap that bonus in the event that inlining wouldn't pan out.
491     // Pretend to inline the function, with a custom threshold.
492     if (IsIndirectCall && BoostIndirectCalls) {
493       auto IndirectCallParams = Params;
494       IndirectCallParams.DefaultThreshold =
495           InlineConstants::IndirectCallThreshold;
496       /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
497       /// to instantiate the derived class.
498       InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
499                                 Call, IndirectCallParams, false);
500       if (CA.analyze().isSuccess()) {
501         // We were able to inline the indirect call! Subtract the cost from the
502         // threshold to get the bonus we want to apply, but don't go below zero.
503         Cost -= std::max(0, CA.getThreshold() - CA.getCost());
504       }
505     } else
506       // Otherwise simply add the cost for merely making the call.
507       addCost(InlineConstants::CallPenalty);
508   }
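  // Illustrative note (values are hypothetical, assuming the usual
  // InlineConstants): if SimplifiedValues resolves an indirect call in the
  // callee body to a concrete function @target, the nested analyzer above is
  // run on @target with IndirectCallThreshold as its default threshold. Should
  // it succeed with, say, getThreshold() == 100 and getCost() == 40, the
  // 60-unit surplus is subtracted from this analyzer's Cost, modelling the
  // expected payoff of devirtualizing and inlining @target; if it fails, no
  // bonus is applied. Direct lowered calls simply pay CallPenalty on top of
  // the per-argument setup cost charged above.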
509 
510   void onFinalizeSwitch(unsigned JumpTableSize,
511                         unsigned NumCaseCluster) override {
512     // If suitable for a jump table, consider the cost for the table size and
513     // branch to destination.
    // The maximum cost added in this function is capped at CostUpperBound.
515     if (JumpTableSize) {
516       int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
517                        4 * InlineConstants::InstrCost;
518 
519       addCost(JTCost, (int64_t)CostUpperBound);
520       return;
521     }
    // If we instead form a binary search tree, the number of nodes equals the
    // number of comparisons needed when the switch is lowered. For a given
    // number of clusters, n, we can define a recursive function, f(n), to find
    // the number of nodes in the tree. The recursion is:
    // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
    // and f(n) = n, when n <= 3.
    // This leads to a binary tree where each leaf is either f(2) or f(3) when
    // n > 3. So the number of comparisons from leaves should be n, while the
    // number of non-leaf nodes should be :
531     //   2^(log2(n) - 1) - 1
532     //   = 2^log2(n) * 2^-1 - 1
533     //   = n / 2 - 1.
534     // Considering comparisons from leaf and non-leaf nodes, we can estimate the
535     // number of comparisons in a simple closed form :
536     //   n + n / 2 - 1 = n * 3 / 2 - 1
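    // Worked example (assuming the usual InstrCost value of 5): with
    // NumCaseCluster == 8 and no jump table, ExpectedNumberOfCompare is
    // 3 * 8 / 2 - 1 == 11, so SwitchCost below is 11 * 2 * 5 == 110 cost
    // units. For NumCaseCluster <= 3 we instead charge one compare and one
    // conditional branch per cluster.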
537     if (NumCaseCluster <= 3) {
538       // Suppose a comparison includes one compare and one conditional branch.
539       addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
540       return;
541     }
542 
543     int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
544     int64_t SwitchCost =
545         ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
546 
547     addCost(SwitchCost, (int64_t)CostUpperBound);
548   }
549   void onMissedSimplification() override {
550     addCost(InlineConstants::InstrCost);
551   }
552 
553   void onInitializeSROAArg(AllocaInst *Arg) override {
554     assert(Arg != nullptr &&
555            "Should not initialize SROA costs for null value.");
556     SROAArgCosts[Arg] = 0;
557   }
558 
559   void onAggregateSROAUse(AllocaInst *SROAArg) override {
560     auto CostIt = SROAArgCosts.find(SROAArg);
561     assert(CostIt != SROAArgCosts.end() &&
562            "expected this argument to have a cost");
563     CostIt->second += InlineConstants::InstrCost;
564     SROACostSavings += InlineConstants::InstrCost;
565   }
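  // Illustrative example (assuming the usual InstrCost value of 5): if an
  // alloca-derived argument feeds six instructions that SROA would eliminate,
  // SROAArgCosts[Arg] and SROACostSavings each accumulate 30 cost units.
  // Should SROA later be disabled for that alloca, onDisableSROA above adds
  // those 30 units back to Cost and records them in SROACostSavingsLost.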
566 
567   void onBlockAnalyzed(const BasicBlock *BB) override {
568     auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
570     // have them as well. Note that we assume any basic blocks which existed
571     // due to branches or switches which folded above will also fold after
572     // inlining.
573     if (SingleBB && TI->getNumSuccessors() > 1) {
574       // Take off the bonus we applied to the threshold.
575       Threshold -= SingleBBBonus;
576       SingleBB = false;
577     }
578   }
579 
  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction has been assessed.
    if (!PrintDebugInstructionDeltas)
      return;
    Writer.CostThresholdMap[I].CostBefore = Cost;
    Writer.CostThresholdMap[I].ThresholdBefore = Threshold;
  }
588 
  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find new values of cost and threshold after
    // the instruction has been assessed.
    if (!PrintDebugInstructionDeltas)
      return;
    Writer.CostThresholdMap[I].CostAfter = Cost;
    Writer.CostThresholdMap[I].ThresholdAfter = Threshold;
  }
597 
598   InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimizing
    // for size, we penalize any call sites whose callees contain loops. We do
    // this after all other costs here, so we will likely only be dealing with
    // relatively small functions (and hence DT and LI will hopefully be
    // cheap).
604     auto *Caller = CandidateCall.getFunction();
605     if (Caller->hasMinSize()) {
606       DominatorTree DT(F);
607       LoopInfo LI(DT);
608       int NumLoops = 0;
609       for (Loop *L : LI) {
610         // Ignore loops that will not be executed
611         if (DeadBlocks.count(L->getHeader()))
612           continue;
613         NumLoops++;
614       }
615       addCost(NumLoops * InlineConstants::CallPenalty);
616     }
617 
618     // We applied the maximum possible vector bonus at the beginning. Now,
619     // subtract the excess bonus, if any, from the Threshold before
620     // comparing against Cost.
621     if (NumVectorInstructions <= NumInstructions / 10)
622       Threshold -= VectorBonus;
623     else if (NumVectorInstructions <= NumInstructions / 2)
624       Threshold -= VectorBonus / 2;
625 
626     if (Cost < std::max(1, Threshold))
627       return InlineResult::success();
628     return InlineResult::failure("Cost over threshold.");
629   }
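  // Illustrative example: with NumInstructions == 100, a count of 30 vector
  // instructions is above the 10% cutoff but at or below the 50% cutoff, so
  // only half of the speculatively applied VectorBonus is taken back before
  // Cost is compared against Threshold.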
630   bool shouldStop() override {
631     // Bail out the moment we cross the threshold. This means we'll under-count
632     // the cost, but only when undercounting doesn't matter.
633     return Cost >= Threshold && !ComputeFullInlineCost;
634   }
635 
636   void onLoadEliminationOpportunity() override {
637     LoadEliminationCost += InlineConstants::InstrCost;
638   }
639 
640   InlineResult onAnalysisStart() override {
641     // Perform some tweaks to the cost and threshold based on the direct
642     // callsite information.
643 
644     // We want to more aggressively inline vector-dense kernels, so up the
645     // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
647     // time by accident as much as because they are principled bonuses.
648     //
649     // FIXME: It would be nice to remove all such bonuses. At least it would be
650     // nice to base the bonus values on something more scientific.
651     assert(NumInstructions == 0);
652     assert(NumVectorInstructions == 0);
653 
654     // Update the threshold based on callsite properties
655     updateThreshold(CandidateCall, F);
656 
657     // While Threshold depends on commandline options that can take negative
658     // values, we want to enforce the invariant that the computed threshold and
659     // bonuses are non-negative.
660     assert(Threshold >= 0);
661     assert(SingleBBBonus >= 0);
662     assert(VectorBonus >= 0);
663 
664     // Speculatively apply all possible bonuses to Threshold. If cost exceeds
665     // this Threshold any time, and cost cannot decrease, we can stop processing
666     // the rest of the function body.
667     Threshold += (SingleBBBonus + VectorBonus);
668 
669     // Give out bonuses for the callsite, as the instructions setting them up
670     // will be gone after inlining.
671     addCost(-getCallsiteCost(this->CandidateCall, DL));
672 
673     // If this function uses the coldcc calling convention, prefer not to inline
674     // it.
675     if (F.getCallingConv() == CallingConv::Cold)
676       Cost += InlineConstants::ColdccPenalty;
677 
678     // Check if we're done. This can happen due to bonuses and penalties.
679     if (Cost >= Threshold && !ComputeFullInlineCost)
680       return InlineResult::failure("high cost");
681 
682     return InlineResult::success();
683   }
684 
685 public:
686   InlineCostCallAnalyzer(
687       const TargetTransformInfo &TTI,
688       std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
689       Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
690       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
691       CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
692       : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
693         ComputeFullInlineCost(OptComputeFullInlineCost ||
694                               Params.ComputeFullInlineCost || ORE),
695         Params(Params), Threshold(Params.DefaultThreshold),
696         BoostIndirectCalls(BoostIndirect) {}
697 
698   /// Annotation Writer for cost annotation
699   CostAnnotationWriter Writer;
700 
701   void dump();
702 
703   virtual ~InlineCostCallAnalyzer() {}
704   int getThreshold() { return Threshold; }
705   int getCost() { return Cost; }
706 };
707 } // namespace
708 
709 /// Test whether the given value is an Alloca-derived function argument.
710 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
711   return SROAArgValues.count(V);
712 }
713 
714 void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
715   onDisableSROA(SROAArg);
716   EnabledSROAAllocas.erase(SROAArg);
717   disableLoadElimination();
718 }
719 
void CostAnnotationWriter::emitInstructionAnnot(const Instruction *I,
                                                formatted_raw_ostream &OS) {
  // The cost of inlining the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero, which happens
  // when we decided to give a bonus at a particular instruction.
  OS << "; cost before = " << CostThresholdMap[I].CostBefore
     << ", cost after = " << CostThresholdMap[I].CostAfter
     << ", threshold before = " << CostThresholdMap[I].ThresholdBefore
     << ", threshold after = " << CostThresholdMap[I].ThresholdAfter << ", ";
  OS << "cost delta = "
     << CostThresholdMap[I].CostAfter - CostThresholdMap[I].CostBefore;
  if (CostThresholdMap[I].ThresholdAfter != CostThresholdMap[I].ThresholdBefore)
    OS << ", threshold delta = "
       << CostThresholdMap[I].ThresholdAfter -
              CostThresholdMap[I].ThresholdBefore;
  OS << "\n";
}
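// For illustration, with -print-instruction-deltas enabled the annotation
// emitted before an instruction might look like (numbers are hypothetical):
//   ; cost before = 0, cost after = 5, threshold before = 487, threshold after = 487, cost delta = 5
// The ", threshold delta = ..." field is appended only when the instruction
// changed the threshold.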
737 
738 /// If 'V' maps to a SROA candidate, disable SROA for it.
739 void CallAnalyzer::disableSROA(Value *V) {
740   if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
741     disableSROAForArg(SROAArg);
742   }
743 }
744 
745 void CallAnalyzer::disableLoadElimination() {
746   if (EnableLoadElimination) {
747     onDisableLoadElimination();
748     EnableLoadElimination = false;
749   }
750 }
751 
752 /// Accumulate a constant GEP offset into an APInt if possible.
753 ///
754 /// Returns false if unable to compute the offset for any reason. Respects any
755 /// simplified values known during the analysis of this callsite.
756 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
757   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
758   assert(IntPtrWidth == Offset.getBitWidth());
759 
760   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
761        GTI != GTE; ++GTI) {
762     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
763     if (!OpC)
764       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
765         OpC = dyn_cast<ConstantInt>(SimpleOp);
766     if (!OpC)
767       return false;
768     if (OpC->isZero())
769       continue;
770 
771     // Handle a struct index, which adds its field offset to the pointer.
772     if (StructType *STy = GTI.getStructTypeOrNull()) {
773       unsigned ElementIdx = OpC->getZExtValue();
774       const StructLayout *SL = DL.getStructLayout(STy);
775       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
776       continue;
777     }
778 
779     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
780     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
781   }
782   return true;
783 }
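// For illustration, given the (hypothetical) IR
//   %p = getelementptr inbounds {i32, i8}, {i32, i8}* %b, i32 0, i32 1
// the leading zero index contributes nothing and the struct index 1
// contributes the field offset of the i8 member (4 bytes under a typical data
// layout), so Offset accumulates to 4. Any index that is neither a constant
// nor present in SimplifiedValues makes the whole accumulation return false.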
784 
785 /// Use TTI to check whether a GEP is free.
786 ///
787 /// Respects any simplified values known during the analysis of this callsite.
788 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
789   SmallVector<Value *, 4> Operands;
790   Operands.push_back(GEP.getOperand(0));
791   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
792     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
793       Operands.push_back(SimpleOp);
794     else
795       Operands.push_back(*I);
796   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
797 }
798 
799 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
800   // Check whether inlining will turn a dynamic alloca into a static
801   // alloca and handle that case.
802   if (I.isArrayAllocation()) {
803     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
804     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
805       Type *Ty = I.getAllocatedType();
806       AllocatedSize = SaturatingMultiplyAdd(
807           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
808           AllocatedSize);
809       return Base::visitAlloca(I);
810     }
811   }
812 
813   // Accumulate the allocated size.
814   if (I.isStaticAlloca()) {
815     Type *Ty = I.getAllocatedType();
816     AllocatedSize =
817         SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(), AllocatedSize);
818   }
819 
820   // We will happily inline static alloca instructions.
821   if (I.isStaticAlloca())
822     return Base::visitAlloca(I);
823 
824   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
825   // a variety of reasons, and so we would like to not inline them into
826   // functions which don't currently have a dynamic alloca. This simply
827   // disables inlining altogether in the presence of a dynamic alloca.
828   HasDynamicAlloca = true;
829   return false;
830 }
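// For illustration: a static "alloca [16 x i32]" adds 64 bytes to
// AllocatedSize under a typical data layout, and "alloca i32, i32 %n" stays on
// the cheap path only if %n was previously simplified to a constant; otherwise
// HasDynamicAlloca is set and the analysis will refuse to inline.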
831 
832 bool CallAnalyzer::visitPHI(PHINode &I) {
833   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
835   // SROA if it *might* be used in an inappropriate manner.
836 
837   // Phi nodes are always zero-cost.
838   // FIXME: Pointer sizes may differ between different address spaces, so do we
839   // need to use correct address space in the call to getPointerSizeInBits here?
840   // Or could we skip the getPointerSizeInBits call completely? As far as I can
841   // see the ZeroOffset is used as a dummy value, so we can probably use any
842   // bit width for the ZeroOffset?
843   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
844   bool CheckSROA = I.getType()->isPointerTy();
845 
846   // Track the constant or pointer with constant offset we've seen so far.
847   Constant *FirstC = nullptr;
848   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
849   Value *FirstV = nullptr;
850 
851   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
852     BasicBlock *Pred = I.getIncomingBlock(i);
853     // If the incoming block is dead, skip the incoming block.
854     if (DeadBlocks.count(Pred))
855       continue;
856     // If the parent block of phi is not the known successor of the incoming
857     // block, skip the incoming block.
858     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
859     if (KnownSuccessor && KnownSuccessor != I.getParent())
860       continue;
861 
862     Value *V = I.getIncomingValue(i);
863     // If the incoming value is this phi itself, skip the incoming value.
864     if (&I == V)
865       continue;
866 
867     Constant *C = dyn_cast<Constant>(V);
868     if (!C)
869       C = SimplifiedValues.lookup(V);
870 
871     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
872     if (!C && CheckSROA)
873       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
874 
875     if (!C && !BaseAndOffset.first)
876       // The incoming value is neither a constant nor a pointer with constant
877       // offset, exit early.
878       return true;
879 
880     if (FirstC) {
881       if (FirstC == C)
882         // If we've seen a constant incoming value before and it is the same
883         // constant we see this time, continue checking the next incoming value.
884         continue;
885       // Otherwise early exit because we either see a different constant or saw
886       // a constant before but we have a pointer with constant offset this time.
887       return true;
888     }
889 
890     if (FirstV) {
891       // The same logic as above, but check pointer with constant offset here.
892       if (FirstBaseAndOffset == BaseAndOffset)
893         continue;
894       return true;
895     }
896 
897     if (C) {
898       // This is the 1st time we've seen a constant, record it.
899       FirstC = C;
900       continue;
901     }
902 
903     // The remaining case is that this is the 1st time we've seen a pointer with
904     // constant offset, record it.
905     FirstV = V;
906     FirstBaseAndOffset = BaseAndOffset;
907   }
908 
909   // Check if we can map phi to a constant.
910   if (FirstC) {
911     SimplifiedValues[&I] = FirstC;
912     return true;
913   }
914 
915   // Check if we can map phi to a pointer with constant offset.
916   if (FirstBaseAndOffset.first) {
917     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
918 
919     if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
920       SROAArgValues[&I] = SROAArg;
921   }
922 
923   return true;
924 }
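// For illustration, consider a phi (hypothetical IR) whose live incoming
// values, once dead predecessor blocks are skipped, are all the same constant:
//   %x = phi i32 [ 4, %then ], [ 4, %else ]
// The phi is recorded in SimplifiedValues as i32 4. Likewise, if every live
// incoming value maps to the same base pointer and constant offset, the phi
// inherits that ConstantOffsetPtrs entry and the SROA candidacy of its base.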
925 
926 /// Check we can fold GEPs of constant-offset call site argument pointers.
927 /// This requires target data and inbounds GEPs.
928 ///
929 /// \return true if the specified GEP can be folded.
930 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
931   // Check if we have a base + offset for the pointer.
932   std::pair<Value *, APInt> BaseAndOffset =
933       ConstantOffsetPtrs.lookup(I.getPointerOperand());
934   if (!BaseAndOffset.first)
935     return false;
936 
937   // Check if the offset of this GEP is constant, and if so accumulate it
938   // into Offset.
939   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
940     return false;
941 
942   // Add the result as a new mapping to Base + Offset.
943   ConstantOffsetPtrs[&I] = BaseAndOffset;
944 
945   return true;
946 }
947 
948 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
949   auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
950 
951   // Lambda to check whether a GEP's indices are all constant.
952   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
953     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
954       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
955         return false;
956     return true;
957   };
958 
959   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
960     if (SROAArg)
961       SROAArgValues[&I] = SROAArg;
962 
963     // Constant GEPs are modeled as free.
964     return true;
965   }
966 
967   // Variable GEPs will require math and will disable SROA.
968   if (SROAArg)
969     disableSROAForArg(SROAArg);
970   return isGEPFree(I);
971 }
972 
973 /// Simplify \p I if its operands are constants and update SimplifiedValues.
974 /// \p Evaluate is a callable specific to instruction type that evaluates the
975 /// instruction when all the operands are constants.
976 template <typename Callable>
977 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
978   SmallVector<Constant *, 2> COps;
979   for (Value *Op : I.operands()) {
980     Constant *COp = dyn_cast<Constant>(Op);
981     if (!COp)
982       COp = SimplifiedValues.lookup(Op);
983     if (!COp)
984       return false;
985     COps.push_back(COp);
986   }
987   auto *C = Evaluate(COps);
988   if (!C)
989     return false;
990   SimplifiedValues[&I] = C;
991   return true;
992 }
993 
994 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
995   // Propagate constants through bitcasts.
996   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
997         return ConstantExpr::getBitCast(COps[0], I.getType());
998       }))
999     return true;
1000 
1001   // Track base/offsets through casts
1002   std::pair<Value *, APInt> BaseAndOffset =
1003       ConstantOffsetPtrs.lookup(I.getOperand(0));
1004   // Casts don't change the offset, just wrap it up.
1005   if (BaseAndOffset.first)
1006     ConstantOffsetPtrs[&I] = BaseAndOffset;
1007 
1008   // Also look for SROA candidates here.
1009   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1010     SROAArgValues[&I] = SROAArg;
1011 
1012   // Bitcasts are always zero cost.
1013   return true;
1014 }
1015 
1016 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1017   // Propagate constants through ptrtoint.
1018   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1019         return ConstantExpr::getPtrToInt(COps[0], I.getType());
1020       }))
1021     return true;
1022 
1023   // Track base/offset pairs when converted to a plain integer provided the
1024   // integer is large enough to represent the pointer.
1025   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1026   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1027   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
1028     std::pair<Value *, APInt> BaseAndOffset =
1029         ConstantOffsetPtrs.lookup(I.getOperand(0));
1030     if (BaseAndOffset.first)
1031       ConstantOffsetPtrs[&I] = BaseAndOffset;
1032   }
1033 
1034   // This is really weird. Technically, ptrtoint will disable SROA. However,
1035   // unless that ptrtoint is *used* somewhere in the live basic blocks after
1036   // inlining, it will be nuked, and SROA should proceed. All of the uses which
1037   // would block SROA would also block SROA if applied directly to a pointer,
1038   // and so we can just add the integer in here. The only places where SROA is
1039   // preserved either cannot fire on an integer, or won't in-and-of themselves
1040   // disable SROA (ext) w/o some later use that we would see and disable.
1041   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1042     SROAArgValues[&I] = SROAArg;
1043 
1044   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1045 }
1046 
1047 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
1049   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1050         return ConstantExpr::getIntToPtr(COps[0], I.getType());
1051       }))
1052     return true;
1053 
1054   // Track base/offset pairs when round-tripped through a pointer without
1055   // modifications provided the integer is not too large.
1056   Value *Op = I.getOperand(0);
1057   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1058   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1059     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1060     if (BaseAndOffset.first)
1061       ConstantOffsetPtrs[&I] = BaseAndOffset;
1062   }
1063 
1064   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1065   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1066     SROAArgValues[&I] = SROAArg;
1067 
1068   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1069 }
1070 
1071 bool CallAnalyzer::visitCastInst(CastInst &I) {
1072   // Propagate constants through casts.
1073   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1074         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
1075       }))
1076     return true;
1077 
1078   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
1079   disableSROA(I.getOperand(0));
1080 
1081   // If this is a floating-point cast, and the target says this operation
1082   // is expensive, this may eventually become a library call. Treat the cost
1083   // as such.
1084   switch (I.getOpcode()) {
1085   case Instruction::FPTrunc:
1086   case Instruction::FPExt:
1087   case Instruction::UIToFP:
1088   case Instruction::SIToFP:
1089   case Instruction::FPToUI:
1090   case Instruction::FPToSI:
1091     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1092       onCallPenalty();
1093     break;
1094   default:
1095     break;
1096   }
1097 
1098   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1099 }
1100 
1101 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
1102   Value *Operand = I.getOperand(0);
1103   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1104         return ConstantFoldInstOperands(&I, COps[0], DL);
1105       }))
1106     return true;
1107 
1108   // Disable any SROA on the argument to arbitrary unary instructions.
1109   disableSROA(Operand);
1110 
1111   return false;
1112 }
1113 
1114 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1115   return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1116 }
1117 
1118 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1119   // Does the *call site* have the NonNull attribute set on an argument?  We
1120   // use the attribute on the call site to memoize any analysis done in the
1121   // caller. This will also trip if the callee function has a non-null
1122   // parameter attribute, but that's a less interesting case because hopefully
1123   // the callee would already have been simplified based on that.
1124   if (Argument *A = dyn_cast<Argument>(V))
1125     if (paramHasAttr(A, Attribute::NonNull))
1126       return true;
1127 
1128   // Is this an alloca in the caller?  This is distinct from the attribute case
1129   // above because attributes aren't updated within the inliner itself and we
1130   // always want to catch the alloca derived case.
1131   if (isAllocaDerivedArg(V))
1132     // We can actually predict the result of comparisons between an
1133     // alloca-derived value and null. Note that this fires regardless of
1134     // SROA firing.
1135     return true;
1136 
1137   return false;
1138 }
1139 
1140 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1141   // If the normal destination of the invoke or the parent block of the call
1142   // site is unreachable-terminated, there is little point in inlining this
1143   // unless there is literally zero cost.
1144   // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
1147   // main() {
1148   //   hot_call_1();
1149   //   ...
1150   //   hot_call_N()
1151   //   exit(0);
1152   // }
1153   // For now, we are not handling this corner case here as it is rare in real
1154   // code. In future, we should elaborate this based on BPI and BFI in more
1155   // general threshold adjusting heuristics in updateThreshold().
1156   if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1157     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1158       return false;
1159   } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1160     return false;
1161 
1162   return true;
1163 }
1164 
1165 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1166                                             BlockFrequencyInfo *CallerBFI) {
1167   // If global profile summary is available, then callsite's coldness is
1168   // determined based on that.
1169   if (PSI && PSI->hasProfileSummary())
1170     return PSI->isColdCallSite(CallSite(&Call), CallerBFI);
1171 
1172   // Otherwise we need BFI to be available.
1173   if (!CallerBFI)
1174     return false;
1175 
1176   // Determine if the callsite is cold relative to caller's entry. We could
1177   // potentially cache the computation of scaled entry frequency, but the added
1178   // complexity is not worth it unless this scaling shows up high in the
1179   // profiles.
1180   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1181   auto CallSiteBB = Call.getParent();
1182   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1183   auto CallerEntryFreq =
1184       CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1185   return CallSiteFreq < CallerEntryFreq * ColdProb;
1186 }
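// Illustrative example: with the default cold-callsite-rel-freq of 2, a
// callsite block running at less than 2% of the caller's entry frequency is
// treated as cold when no profile summary is available, and updateThreshold
// then clamps the threshold to ColdCallSiteThreshold (45 by default).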
1187 
1188 Optional<int>
1189 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1190                                                 BlockFrequencyInfo *CallerBFI) {
1191 
1192   // If global profile summary is available, then callsite's hotness is
1193   // determined based on that.
1194   if (PSI && PSI->hasProfileSummary() &&
1195       PSI->isHotCallSite(CallSite(&Call), CallerBFI))
1196     return Params.HotCallSiteThreshold;
1197 
1198   // Otherwise we need BFI to be available and to have a locally hot callsite
1199   // threshold.
1200   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1201     return None;
1202 
1203   // Determine if the callsite is hot relative to caller's entry. We could
1204   // potentially cache the computation of scaled entry frequency, but the added
1205   // complexity is not worth it unless this scaling shows up high in the
1206   // profiles.
1207   auto CallSiteBB = Call.getParent();
1208   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
1209   auto CallerEntryFreq = CallerBFI->getEntryFreq();
1210   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
1211     return Params.LocallyHotCallSiteThreshold;
1212 
1213   // Otherwise treat it normally.
1214   return None;
1215 }
1216 
1217 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1218   // If no size growth is allowed for this inlining, set Threshold to 0.
1219   if (!allowSizeGrowth(Call)) {
1220     Threshold = 0;
1221     return;
1222   }
1223 
1224   Function *Caller = Call.getCaller();
1225 
1226   // return min(A, B) if B is valid.
1227   auto MinIfValid = [](int A, Optional<int> B) {
1228     return B ? std::min(A, B.getValue()) : A;
1229   };
1230 
1231   // return max(A, B) if B is valid.
1232   auto MaxIfValid = [](int A, Optional<int> B) {
1233     return B ? std::max(A, B.getValue()) : A;
1234   };
1235 
1236   // Various bonus percentages. These are multiplied by Threshold to get the
1237   // bonus values.
1238   // SingleBBBonus: This bonus is applied if the callee has a single reachable
1239   // basic block at the given callsite context. This is speculatively applied
1240   // and withdrawn if more than one basic block is seen.
1241   //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1243   // of the last call to a static function as inlining such functions is
1244   // guaranteed to reduce code size.
1245   //
1246   // These bonus percentages may be set to 0 based on properties of the caller
1247   // and the callsite.
1248   int SingleBBBonusPercent = 50;
1249   int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1250   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1251 
1252   // Lambda to set all the above bonus and bonus percentages to 0.
1253   auto DisallowAllBonuses = [&]() {
1254     SingleBBBonusPercent = 0;
1255     VectorBonusPercent = 0;
1256     LastCallToStaticBonus = 0;
1257   };
1258 
1259   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1260   // and reduce the threshold if the caller has the necessary attribute.
1261   if (Caller->hasMinSize()) {
1262     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1263     // For minsize, we want to disable the single BB bonus and the vector
1264     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1265     // a static function will, at the minimum, eliminate the parameter setup and
1266     // call/return instructions.
1267     SingleBBBonusPercent = 0;
1268     VectorBonusPercent = 0;
1269   } else if (Caller->hasOptSize())
1270     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1271 
1272   // Adjust the threshold based on inlinehint attribute and profile based
1273   // hotness information if the caller does not have MinSize attribute.
1274   if (!Caller->hasMinSize()) {
1275     if (Callee.hasFnAttribute(Attribute::InlineHint))
1276       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1277 
1278     // FIXME: After switching to the new passmanager, simplify the logic below
1279     // by checking only the callsite hotness/coldness as we will reliably
1280     // have local profile information.
1281     //
1282     // Callsite hotness and coldness can be determined if sample profile is
1283     // used (which adds hotness metadata to calls) or if caller's
1284     // BlockFrequencyInfo is available.
1285     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
1286     auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1287     if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1288       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1289       // FIXME: This should update the threshold only if it exceeds the
1290       // current threshold, but AutoFDO + ThinLTO currently relies on this
1291       // behavior to prevent inlining of hot callsites during ThinLTO
1292       // compile phase.
1293       Threshold = HotCallSiteThreshold.getValue();
1294     } else if (isColdCallSite(Call, CallerBFI)) {
1295       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1296       // Do not apply bonuses for a cold callsite including the
1297       // LastCallToStatic bonus. While this bonus might result in code size
1298       // reduction, it can cause the size of a non-cold caller to increase
1299       // preventing it from being inlined.
1300       DisallowAllBonuses();
1301       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1302     } else if (PSI) {
1303       // Use callee's global profile information only if we have no way of
1304       // determining this via callsite information.
1305       if (PSI->isFunctionEntryHot(&Callee)) {
1306         LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know that
        // the callee is hot and treat it as a weaker hint for a threshold
        // increase.
1310         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1311       } else if (PSI->isFunctionEntryCold(&Callee)) {
1312         LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee, including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase,
        // preventing it from being inlined.
1317         DisallowAllBonuses();
1318         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
1319       }
1320     }
1321   }
1322 
1323   // Finally, take the target-specific inlining threshold multiplier into
1324   // account.
1325   Threshold *= TTI.getInliningThresholdMultiplier();
1326 
1327   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1328   VectorBonus = Threshold * VectorBonusPercent / 100;
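  // For illustration, assuming the common default values: a threshold of 225,
  // the 50% single-BB bonus, and a 150% vector bonus yield SingleBBBonus = 112
  // and VectorBonus = 337 after the integer division above. The exact numbers
  // depend on the threshold selected earlier and on TTI.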
1329 
1330   bool OnlyOneCallAndLocalLinkage =
1331       F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
1332   // If there is only one call of the function, and it has internal linkage,
1333   // the cost of inlining it drops dramatically. It may seem odd to update
1334   // Cost in updateThreshold, but the bonus depends on the logic in this method.
1335   if (OnlyOneCallAndLocalLinkage)
1336     Cost -= LastCallToStaticBonus;
1337 }
1338 
1339 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1340   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1341   // First try to handle simplified comparisons.
1342   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1343         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
1344       }))
1345     return true;
1346 
1347   if (I.getOpcode() == Instruction::FCmp)
1348     return false;
1349 
1350   // Otherwise look for a comparison between constant offset pointers with
1351   // a common base.
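  // For example (illustrative IR), if both pointers are tracked as constant
  // offsets from the same base:
  //   %p = getelementptr inbounds i8, i8* %buf, i64 4
  //   %q = getelementptr inbounds i8, i8* %buf, i64 8
  //   %c = icmp ult i8* %p, %q
  // the icmp below folds to a constant computed from the offsets 4 and 8.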
1352   Value *LHSBase, *RHSBase;
1353   APInt LHSOffset, RHSOffset;
1354   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1355   if (LHSBase) {
1356     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1357     if (RHSBase && LHSBase == RHSBase) {
1358       // We have common bases, fold the icmp to a constant based on the
1359       // offsets.
1360       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1361       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1362       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1363         SimplifiedValues[&I] = C;
1364         ++NumConstantPtrCmps;
1365         return true;
1366       }
1367     }
1368   }
1369 
1370   // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
1372   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1373       isKnownNonNullInCallee(I.getOperand(0))) {
1374     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1375     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1376                                       : ConstantInt::getFalse(I.getType());
1377     return true;
1378   }
1379   return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
1380 }
1381 
1382 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1383   // Try to handle a special case: we can fold computing the difference of two
1384   // constant-related pointers.
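  // For example, if both operands are tracked (e.g. via ptrtoint) as constant
  // offsets 8 and 4 from the same base pointer, the subtraction below folds
  // to the constant 4.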
1385   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1386   Value *LHSBase, *RHSBase;
1387   APInt LHSOffset, RHSOffset;
1388   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1389   if (LHSBase) {
1390     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1391     if (RHSBase && LHSBase == RHSBase) {
1392       // We have common bases, fold the subtract to a constant based on the
1393       // offsets.
1394       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1395       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1396       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1397         SimplifiedValues[&I] = C;
1398         ++NumConstantPtrDiffs;
1399         return true;
1400       }
1401     }
1402   }
1403 
1404   // Otherwise, fall back to the generic logic for simplifying and handling
1405   // instructions.
1406   return Base::visitSub(I);
1407 }
1408 
1409 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1410   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1411   Constant *CLHS = dyn_cast<Constant>(LHS);
1412   if (!CLHS)
1413     CLHS = SimplifiedValues.lookup(LHS);
1414   Constant *CRHS = dyn_cast<Constant>(RHS);
1415   if (!CRHS)
1416     CRHS = SimplifiedValues.lookup(RHS);
1417 
1418   Value *SimpleV = nullptr;
1419   if (auto FI = dyn_cast<FPMathOperator>(&I))
1420     SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
1421                             FI->getFastMathFlags(), DL);
1422   else
1423     SimpleV =
1424         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1425 
1426   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1427     SimplifiedValues[&I] = C;
1428 
1429   if (SimpleV)
1430     return true;
1431 
1432   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1433   disableSROA(LHS);
1434   disableSROA(RHS);
1435 
  // If the instruction is floating point, and the target says this operation
  // is expensive, it may eventually become a library call. Treat the cost as
  // such, unless it is an fneg, which can be implemented with an xor.
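  // For example, an fdiv on a type for which the target reports TCC_Expensive
  // is charged a call penalty here, since it may well be lowered to a library
  // call.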
1439   using namespace llvm::PatternMatch;
1440   if (I.getType()->isFloatingPointTy() &&
1441       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
1442       !match(&I, m_FNeg(m_Value())))
1443     onCallPenalty();
1444 
1445   return false;
1446 }
1447 
1448 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
1449   Value *Op = I.getOperand(0);
1450   Constant *COp = dyn_cast<Constant>(Op);
1451   if (!COp)
1452     COp = SimplifiedValues.lookup(Op);
1453 
1454   Value *SimpleV = SimplifyFNegInst(
1455       COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
1456 
1457   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1458     SimplifiedValues[&I] = C;
1459 
1460   if (SimpleV)
1461     return true;
1462 
1463   // Disable any SROA on arguments to arbitrary, unsimplified fneg.
1464   disableSROA(Op);
1465 
1466   return false;
1467 }
1468 
1469 bool CallAnalyzer::visitLoad(LoadInst &I) {
1470   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1471     return true;
1472 
1473   // If the data is already loaded from this address and hasn't been clobbered
1474   // by any stores or calls, this load is likely to be redundant and can be
1475   // eliminated.
1476   if (EnableLoadElimination &&
1477       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1478     onLoadEliminationOpportunity();
1479     return true;
1480   }
1481 
1482   return false;
1483 }
1484 
1485 bool CallAnalyzer::visitStore(StoreInst &I) {
1486   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1487     return true;
1488 
1489   // The store can potentially clobber loads and prevent repeated loads from
1490   // being eliminated.
1491   // FIXME:
  // 1. We can probably keep an initial set of eliminable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
1495   // 2. We should probably at some point thread MemorySSA for the callee into
1496   // this and then use that to actually compute *really* precise savings.
1497   disableLoadElimination();
1498   return false;
1499 }
1500 
1501 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1502   // Constant folding for extract value is trivial.
1503   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1504         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1505       }))
1506     return true;
1507 
1508   // SROA can look through these but give them a cost.
1509   return false;
1510 }
1511 
1512 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1513   // Constant folding for insert value is trivial.
1514   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1515         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1516                                             /*InsertedValueOperand*/ COps[1],
1517                                             I.getIndices());
1518       }))
1519     return true;
1520 
1521   // SROA can look through these but give them a cost.
1522   return false;
1523 }
1524 
1525 /// Try to simplify a call site.
1526 ///
1527 /// Takes a concrete function and callsite and tries to actually simplify it by
1528 /// analyzing the arguments and call itself with instsimplify. Returns true if
1529 /// it has simplified the callsite to some other entity (a constant), making it
1530 /// free.
1531 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
1532   // FIXME: Using the instsimplify logic directly for this is inefficient
1533   // because we have to continually rebuild the argument list even when no
1534   // simplifications can be performed. Until that is fixed with remapping
1535   // inside of instsimplify, directly constant fold calls here.
1536   if (!canConstantFoldCallTo(&Call, F))
1537     return false;
1538 
1539   // Try to re-map the arguments to constants.
1540   SmallVector<Constant *, 4> ConstantArgs;
1541   ConstantArgs.reserve(Call.arg_size());
1542   for (Value *I : Call.args()) {
1543     Constant *C = dyn_cast<Constant>(I);
1544     if (!C)
1545       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
1546     if (!C)
1547       return false; // This argument doesn't map to a constant.
1548 
1549     ConstantArgs.push_back(C);
1550   }
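  // At this point every argument maps to a constant, so, for example, a call
  // to a constant-foldable intrinsic such as llvm.ctpop with a constant
  // operand becomes free here.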
1551   if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
1552     SimplifiedValues[&Call] = C;
1553     return true;
1554   }
1555 
1556   return false;
1557 }
1558 
1559 bool CallAnalyzer::visitCallBase(CallBase &Call) {
1560   if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
1561       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1562     // This aborts the entire analysis.
1563     ExposesReturnsTwice = true;
1564     return false;
1565   }
1566   if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
1567     ContainsNoDuplicateCall = true;
1568 
1569   Value *Callee = Call.getCalledOperand();
1570   Function *F = dyn_cast_or_null<Function>(Callee);
1571   bool IsIndirectCall = !F;
1572   if (IsIndirectCall) {
1573     // Check if this happens to be an indirect function call to a known function
1574     // in this inline context. If not, we've done all we can.
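    // For example, a function-pointer argument that is a known constant at
    // this particular call site will have been recorded in SimplifiedValues,
    // letting the "indirect" call resolve to a concrete callee here.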
1575     F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1576     if (!F) {
1577       onCallArgumentSetup(Call);
1578 
1579       if (!Call.onlyReadsMemory())
1580         disableLoadElimination();
1581       return Base::visitCallBase(Call);
1582     }
1583   }
1584 
1585   assert(F && "Expected a call to a known function");
1586 
1587   // When we have a concrete function, first try to simplify it directly.
1588   if (simplifyCallSite(F, Call))
1589     return true;
1590 
1591   // Next check if it is an intrinsic we know about.
1592   // FIXME: Lift this into part of the InstVisitor.
1593   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
1594     switch (II->getIntrinsicID()) {
1595     default:
1596       if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1597         disableLoadElimination();
1598       return Base::visitCallBase(Call);
1599 
1600     case Intrinsic::load_relative:
1601       onLoadRelativeIntrinsic();
1602       return false;
1603 
1604     case Intrinsic::memset:
1605     case Intrinsic::memcpy:
1606     case Intrinsic::memmove:
1607       disableLoadElimination();
1608       // SROA can usually chew through these intrinsics, but they aren't free.
1609       return false;
1610     case Intrinsic::icall_branch_funnel:
1611     case Intrinsic::localescape:
1612       HasUninlineableIntrinsic = true;
1613       return false;
1614     case Intrinsic::vastart:
1615       InitsVargArgs = true;
1616       return false;
1617     }
1618   }
1619 
1620   if (F == Call.getFunction()) {
1621     // This flag will fully abort the analysis, so don't bother with anything
1622     // else.
1623     IsRecursiveCall = true;
1624     return false;
1625   }
1626 
1627   if (TTI.isLoweredToCall(F)) {
1628     onLoweredCall(F, Call, IsIndirectCall);
1629   }
1630 
1631   if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
1632     disableLoadElimination();
1633   return Base::visitCallBase(Call);
1634 }
1635 
1636 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1637   // At least one return instruction will be free after inlining.
1638   bool Free = !HasReturn;
1639   HasReturn = true;
1640   return Free;
1641 }
1642 
1643 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1644   // We model unconditional branches as essentially free -- they really
1645   // shouldn't exist at all, but handling them makes the behavior of the
1646   // inliner more regular and predictable. Interestingly, conditional branches
1647   // which will fold away are also free.
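  // For example, "br i1 %c, label %a, label %b" is free when %c has been
  // simplified to a constant, since only one successor is actually reachable.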
1648   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1649          dyn_cast_or_null<ConstantInt>(
1650              SimplifiedValues.lookup(BI.getCondition()));
1651 }
1652 
1653 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1654   bool CheckSROA = SI.getType()->isPointerTy();
1655   Value *TrueVal = SI.getTrueValue();
1656   Value *FalseVal = SI.getFalseValue();
1657 
1658   Constant *TrueC = dyn_cast<Constant>(TrueVal);
1659   if (!TrueC)
1660     TrueC = SimplifiedValues.lookup(TrueVal);
1661   Constant *FalseC = dyn_cast<Constant>(FalseVal);
1662   if (!FalseC)
1663     FalseC = SimplifiedValues.lookup(FalseVal);
1664   Constant *CondC =
1665       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1666 
1667   if (!CondC) {
1668     // Select C, X, X => X
1669     if (TrueC == FalseC && TrueC) {
1670       SimplifiedValues[&SI] = TrueC;
1671       return true;
1672     }
1673 
1674     if (!CheckSROA)
1675       return Base::visitSelectInst(SI);
1676 
1677     std::pair<Value *, APInt> TrueBaseAndOffset =
1678         ConstantOffsetPtrs.lookup(TrueVal);
1679     std::pair<Value *, APInt> FalseBaseAndOffset =
1680         ConstantOffsetPtrs.lookup(FalseVal);
1681     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1682       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1683 
1684       if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
1685         SROAArgValues[&SI] = SROAArg;
1686       return true;
1687     }
1688 
1689     return Base::visitSelectInst(SI);
1690   }
1691 
1692   // Select condition is a constant.
1693   Value *SelectedV = CondC->isAllOnesValue()
1694                          ? TrueVal
1695                          : (CondC->isNullValue()) ? FalseVal : nullptr;
1696   if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle cases
    // such as vector selects.
1700     if (TrueC && FalseC) {
1701       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1702         SimplifiedValues[&SI] = C;
1703         return true;
1704       }
1705     }
1706     return Base::visitSelectInst(SI);
1707   }
1708 
1709   // Condition is either all 1s or all 0s. SI can be simplified.
1710   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1711     SimplifiedValues[&SI] = SelectedC;
1712     return true;
1713   }
1714 
1715   if (!CheckSROA)
1716     return true;
1717 
1718   std::pair<Value *, APInt> BaseAndOffset =
1719       ConstantOffsetPtrs.lookup(SelectedV);
1720   if (BaseAndOffset.first) {
1721     ConstantOffsetPtrs[&SI] = BaseAndOffset;
1722 
1723     if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
1724       SROAArgValues[&SI] = SROAArg;
1725   }
1726 
1727   return true;
1728 }
1729 
1730 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model switches with a known constant condition as free; see the
  // comments on handling branches.
1733   if (isa<ConstantInt>(SI.getCondition()))
1734     return true;
1735   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1736     if (isa<ConstantInt>(V))
1737       return true;
1738 
  // Assume the most general case: the switch is lowered into either a jump
  // table, a bit test, or a balanced binary tree of case clusters, without
  // merging adjacent clusters that share a destination. We do not consider
  // switches that are lowered with a mix of jump table, bit test, and binary
  // search tree. The cost of the switch is proportional to the size of the
  // tree or the size of the jump table range.
1745   //
  // NB: Large switches that are only used to initialize large phi nodes are
  // converted to lookup tables in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where that optimization
  // does not (yet) fire.
1750 
1751   unsigned JumpTableSize = 0;
1752   BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
1753   unsigned NumCaseCluster =
1754       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
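  // As an illustration, a dense switch over a small contiguous range is
  // typically reported by TTI as a single jump table covering that range,
  // while a handful of sparse cases come back as a few case clusters; either
  // shape is then costed by onFinalizeSwitch below.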
1755 
1756   onFinalizeSwitch(JumpTableSize, NumCaseCluster);
1757   return false;
1758 }
1759 
1760 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers,
  // for example) would still refer to the original function, and the indirect
  // jump would go from the inlined copy of the function into the original
  // function, which is undefined behavior.
1766   // FIXME: This logic isn't really right; we can safely inline functions with
1767   // indirectbr's as long as no other function or global references the
1768   // blockaddress of a block within the current function.
1769   HasIndirectBr = true;
1770   return false;
1771 }
1772 
1773 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1774   // FIXME: It's not clear that a single instruction is an accurate model for
1775   // the inline cost of a resume instruction.
1776   return false;
1777 }
1778 
1779 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1780   // FIXME: It's not clear that a single instruction is an accurate model for
1781   // the inline cost of a cleanupret instruction.
1782   return false;
1783 }
1784 
1785 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1786   // FIXME: It's not clear that a single instruction is an accurate model for
1787   // the inline cost of a catchret instruction.
1788   return false;
1789 }
1790 
1791 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable, as they have the lowest possible impact on both runtime
  // and code size.
1795   return true; // No actual code is needed for unreachable.
1796 }
1797 
1798 bool CallAnalyzer::visitInstruction(Instruction &I) {
1799   // Some instructions are free. All of the free intrinsics can also be
1800   // handled by SROA, etc.
1801   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1802     return true;
1803 
1804   // We found something we don't understand or can't handle. Mark any SROA-able
1805   // values in the operand list as no longer viable.
1806   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1807     disableSROA(*OI);
1808 
1809   return false;
1810 }
1811 
1812 /// Analyze a basic block for its contribution to the inline cost.
1813 ///
1814 /// This method walks the analyzer over every instruction in the given basic
1815 /// block and accounts for their cost during inlining at this callsite. It
1816 /// aborts early if the threshold has been exceeded or an impossible to inline
1817 /// construct has been detected. It returns false if inlining is no longer
1818 /// viable, and true if inlining remains viable.
1819 InlineResult
1820 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1821                            SmallPtrSetImpl<const Value *> &EphValues) {
1822   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them to constants or dead code during
    // inlining, is used by the vector bonus heuristic. As long as that's true,
    // we have to special-case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
1830     if (isa<DbgInfoIntrinsic>(I))
1831       continue;
1832 
    // Skip ephemeral values, i.e. values that are used only by @llvm.assume
    // (directly or transitively) and therefore contribute no real cost.
1834     if (EphValues.count(&*I))
1835       continue;
1836 
1837     ++NumInstructions;
1838     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1839       ++NumVectorInstructions;
1840 
1841     // If the instruction simplified to a constant, there is no cost to this
1842     // instruction. Visit the instructions using our InstVisitor to account for
1843     // all of the per-instruction logic. The visit tree returns true if we
1844     // consumed the instruction in any way, and false if the instruction's base
1845     // cost should count against inlining.
1846     onInstructionAnalysisStart(&*I);
1847 
1848     if (Base::visit(&*I))
1849       ++NumInstructionsSimplified;
1850     else
1851       onMissedSimplification();
1852 
1853     onInstructionAnalysisFinish(&*I);
1854     using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
1856     InlineResult IR = InlineResult::success();
1857     if (IsRecursiveCall)
1858       IR = InlineResult::failure("recursive");
1859     else if (ExposesReturnsTwice)
1860       IR = InlineResult::failure("exposes returns twice");
1861     else if (HasDynamicAlloca)
1862       IR = InlineResult::failure("dynamic alloca");
1863     else if (HasIndirectBr)
1864       IR = InlineResult::failure("indirect branch");
1865     else if (HasUninlineableIntrinsic)
1866       IR = InlineResult::failure("uninlinable intrinsic");
1867     else if (InitsVargArgs)
1868       IR = InlineResult::failure("varargs");
1869     if (!IR.isSuccess()) {
1870       if (ORE)
1871         ORE->emit([&]() {
1872           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1873                                           &CandidateCall)
1874                  << NV("Callee", &F) << " has uninlinable pattern ("
1875                  << NV("InlineResult", IR.getFailureReason())
1876                  << ") and cost is not fully computed";
1877         });
1878       return IR;
1879     }
1880 
1881     // If the caller is a recursive function then we don't want to inline
1882     // functions which allocate a lot of stack space because it would increase
1883     // the caller stack usage dramatically.
1884     if (IsCallerRecursive &&
1885         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1886       auto IR =
1887           InlineResult::failure("recursive and allocates too much stack space");
1888       if (ORE)
1889         ORE->emit([&]() {
1890           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1891                                           &CandidateCall)
1892                  << NV("Callee", &F) << " is "
1893                  << NV("InlineResult", IR.getFailureReason())
1894                  << ". Cost is not fully computed";
1895         });
1896       return IR;
1897     }
1898 
1899     if (shouldStop())
1900       return InlineResult::failure(
1901           "Call site analysis is not favorable to inlining.");
1902   }
1903 
1904   return InlineResult::success();
1905 }
1906 
1907 /// Compute the base pointer and cumulative constant offsets for V.
1908 ///
1909 /// This strips all constant offsets off of V, leaving it the base pointer, and
1910 /// accumulates the total constant offset applied in the returned constant. It
/// returns nullptr if V is not a pointer, and returns the constant '0' if
/// there are no constant offsets applied.
1913 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1914   if (!V->getType()->isPointerTy())
1915     return nullptr;
1916 
1917   unsigned AS = V->getType()->getPointerAddressSpace();
1918   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1919   APInt Offset = APInt::getNullValue(IntPtrWidth);
1920 
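  // For example, if V is defined as a bitcast of an inbounds GEP that adds a
  // constant 12 bytes to some base pointer, the loop below strips both
  // operations, leaves V pointing at the base, and the constant 12 is
  // returned.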
1921   // Even though we don't look through PHI nodes, we could be called on an
1922   // instruction in an unreachable block, which may be on a cycle.
1923   SmallPtrSet<Value *, 4> Visited;
1924   Visited.insert(V);
1925   do {
1926     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1927       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1928         return nullptr;
1929       V = GEP->getPointerOperand();
1930     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1931       V = cast<Operator>(V)->getOperand(0);
1932     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1933       if (GA->isInterposable())
1934         break;
1935       V = GA->getAliasee();
1936     } else {
1937       break;
1938     }
1939     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1940   } while (Visited.insert(V).second);
1941 
1942   Type *IdxPtrTy = DL.getIndexType(V->getType());
1943   return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
1944 }
1945 
1946 /// Find dead blocks due to deleted CFG edges during inlining.
1947 ///
1948 /// If we know the successor of the current block, \p CurrBB, has to be \p
1949 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1950 /// no live incoming CFG edges.  If one block is found to be dead, we can
1951 /// continue growing the dead block list by checking the successors of the dead
1952 /// blocks to see if all their incoming edges are dead or not.
1953 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1954   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1955     // A CFG edge is dead if the predecessor is dead or the predecessor has a
1956     // known successor which is not the one under exam.
1957     return (DeadBlocks.count(Pred) ||
1958             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1959   };
1960 
1961   auto IsNewlyDead = [&](BasicBlock *BB) {
1962     // If all the edges to a block are dead, the block is also dead.
1963     return (!DeadBlocks.count(BB) &&
1964             llvm::all_of(predecessors(BB),
1965                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1966   };
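  // Seed the search with CurrBB's successors other than NextBB: for example,
  // if a conditional branch in CurrBB is known to take NextBB because its
  // condition simplified to a constant, the other successor is dead unless
  // some other live block still branches to it.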
1967 
1968   for (BasicBlock *Succ : successors(CurrBB)) {
1969     if (Succ == NextBB || !IsNewlyDead(Succ))
1970       continue;
1971     SmallVector<BasicBlock *, 4> NewDead;
1972     NewDead.push_back(Succ);
1973     while (!NewDead.empty()) {
1974       BasicBlock *Dead = NewDead.pop_back_val();
1975       if (DeadBlocks.insert(Dead))
1976         // Continue growing the dead block lists.
1977         for (BasicBlock *S : successors(Dead))
1978           if (IsNewlyDead(S))
1979             NewDead.push_back(S);
1980     }
1981   }
1982 }
1983 
1984 /// Analyze a call site for potential inlining.
1985 ///
1986 /// Returns true if inlining this call is viable, and false if it is not
1987 /// viable. It computes the cost and adjusts the threshold based on numerous
1988 /// factors and heuristics. If this method returns false but the computed cost
1989 /// is below the computed threshold, then inlining was forcibly disabled by
1990 /// some artifact of the routine.
1991 InlineResult CallAnalyzer::analyze() {
1992   ++NumCallsAnalyzed;
1993 
1994   auto Result = onAnalysisStart();
1995   if (!Result.isSuccess())
1996     return Result;
1997 
1998   if (F.empty())
1999     return InlineResult::success();
2000 
2001   Function *Caller = CandidateCall.getFunction();
2002   // Check if the caller function is recursive itself.
2003   for (User *U : Caller->users()) {
2004     CallBase *Call = dyn_cast<CallBase>(U);
2005     if (Call && Call->getFunction() == Caller) {
2006       IsCallerRecursive = true;
2007       break;
2008     }
2009   }
2010 
2011   // Populate our simplified values by mapping from function arguments to call
2012   // arguments with known important simplifications.
2013   auto CAI = CandidateCall.arg_begin();
2014   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
2015        FAI != FAE; ++FAI, ++CAI) {
2016     assert(CAI != CandidateCall.arg_end());
2017     if (Constant *C = dyn_cast<Constant>(CAI))
2018       SimplifiedValues[&*FAI] = C;
2019 
2020     Value *PtrArg = *CAI;
2021     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2022       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
2023 
2024       // We can SROA any pointer arguments derived from alloca instructions.
2025       if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2026         SROAArgValues[&*FAI] = SROAArg;
2027         onInitializeSROAArg(SROAArg);
2028         EnabledSROAAllocas.insert(SROAArg);
2029       }
2030     }
2031   }
2032   NumConstantArgs = SimplifiedValues.size();
2033   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2034   NumAllocaArgs = SROAArgValues.size();
2035 
2036   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2037   // the ephemeral values multiple times (and they're completely determined by
2038   // the callee, so this is purely duplicate work).
2039   SmallPtrSet<const Value *, 32> EphValues;
2040   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2041 
2042   // The worklist of live basic blocks in the callee *after* inlining. We avoid
2043   // adding basic blocks of the callee which can be proven to be dead for this
2044   // particular call site in order to get more accurate cost estimates. This
2045   // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this we use a small-size optimized SetVector, prioritizing
  // small iterations because we exit after crossing our threshold.
2049   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
2050                     SmallPtrSet<BasicBlock *, 16>>
2051       BBSetVector;
2052   BBSetVector BBWorklist;
2053   BBWorklist.insert(&F.getEntryBlock());
2054 
2055   // Note that we *must not* cache the size, this loop grows the worklist.
2056   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2057     if (shouldStop())
2058       break;
2059 
2060     BasicBlock *BB = BBWorklist[Idx];
2061     if (BB->empty())
2062       continue;
2063 
2064     // Disallow inlining a blockaddress with uses other than strictly callbr.
2065     // A blockaddress only has defined behavior for an indirect branch in the
2066     // same function, and we do not currently support inlining indirect
2067     // branches.  But, the inliner may not see an indirect branch that ends up
2068     // being dead code at a particular call site. If the blockaddress escapes
2069     // the function, e.g., via a global variable, inlining may lead to an
2070     // invalid cross-function reference.
2071     // FIXME: pr/39560: continue relaxing this overt restriction.
2072     if (BB->hasAddressTaken())
2073       for (User *U : BlockAddress::get(&*BB)->users())
2074         if (!isa<CallBrInst>(*U))
2075           return InlineResult::failure("blockaddress used outside of callbr");
2076 
    // Analyze the cost of this block. If we blow through the threshold, this
    // returns an unsuccessful InlineResult, and we can bail out.
2079     InlineResult IR = analyzeBlock(BB, EphValues);
2080     if (!IR.isSuccess())
2081       return IR;
2082 
2083     Instruction *TI = BB->getTerminator();
2084 
2085     // Add in the live successors by first checking whether we have terminator
2086     // that may be simplified based on the values simplified by this call.
2087     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2088       if (BI->isConditional()) {
2089         Value *Cond = BI->getCondition();
2090         if (ConstantInt *SimpleCond =
2091                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2092           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2093           BBWorklist.insert(NextBB);
2094           KnownSuccessors[BB] = NextBB;
2095           findDeadBlocks(BB, NextBB);
2096           continue;
2097         }
2098       }
2099     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2100       Value *Cond = SI->getCondition();
2101       if (ConstantInt *SimpleCond =
2102               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2103         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2104         BBWorklist.insert(NextBB);
2105         KnownSuccessors[BB] = NextBB;
2106         findDeadBlocks(BB, NextBB);
2107         continue;
2108       }
2109     }
2110 
2111     // If we're unable to select a particular successor, just count all of
2112     // them.
2113     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2114          ++TIdx)
2115       BBWorklist.insert(TI->getSuccessor(TIdx));
2116 
2117     onBlockAnalyzed(BB);
2118   }
2119 
2120   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
2121                                     &F == CandidateCall.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the callee (so the instruction
  // is not actually duplicated, just moved).
2125   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
2126     return InlineResult::failure("noduplicate");
2127 
2128   return finalizeAnalysis();
2129 }
2130 
2131 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2132 /// Dump stats about this call's analysis.
2133 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
2134 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
2135   if (PrintDebugInstructionDeltas)
2136     F.print(dbgs(), &Writer);
2137   DEBUG_PRINT_STAT(NumConstantArgs);
2138   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2139   DEBUG_PRINT_STAT(NumAllocaArgs);
2140   DEBUG_PRINT_STAT(NumConstantPtrCmps);
2141   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2142   DEBUG_PRINT_STAT(NumInstructionsSimplified);
2143   DEBUG_PRINT_STAT(NumInstructions);
2144   DEBUG_PRINT_STAT(SROACostSavings);
2145   DEBUG_PRINT_STAT(SROACostSavingsLost);
2146   DEBUG_PRINT_STAT(LoadEliminationCost);
2147   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2148   DEBUG_PRINT_STAT(Cost);
2149   DEBUG_PRINT_STAT(Threshold);
2150 #undef DEBUG_PRINT_STAT
2151 }
2152 #endif
2153 
/// Test that there are no attribute conflicts between Caller and Callee that
/// prevent inlining.
2156 static bool functionsHaveCompatibleAttributes(
2157     Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2158     function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2159   // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2160   // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2161   // object, and always returns the same object (which is overwritten on each
2162   // GetTLI call). Therefore we copy the first result.
2163   auto CalleeTLI = GetTLI(*Callee);
2164   return TTI.areInlineCompatible(Caller, Callee) &&
2165          GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2166                                              InlineCallerSupersetNoBuiltin) &&
2167          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2168 }
2169 
2170 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
2171   int Cost = 0;
2172   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2173     if (Call.isByValArgument(I)) {
2174       // We approximate the number of loads and stores needed by dividing the
2175       // size of the byval type by the target's pointer size.
2176       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2177       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
2178       unsigned AS = PTy->getAddressSpace();
2179       unsigned PointerSize = DL.getPointerSizeInBits(AS);
2180       // Ceiling division.
2181       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2182 
2183       // If it generates more than 8 stores it is likely to be expanded as an
2184       // inline memcpy so we take that as an upper bound. Otherwise we assume
2185       // one load and one store per word copied.
2186       // FIXME: The maxStoresPerMemcpy setting from the target should be used
2187       // here instead of a magic number of 8, but it's not available via
2188       // DataLayout.
2189       NumStores = std::min(NumStores, 8U);
2190 
2191       Cost += 2 * NumStores * InlineConstants::InstrCost;
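      // For example, a 256-bit byval aggregate with 64-bit pointers gives
      // NumStores = 4 and charges 2 * 4 * InstrCost for the copy (one load
      // and one store per word).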
2192     } else {
2193       // For non-byval arguments subtract off one instruction per call
2194       // argument.
2195       Cost += InlineConstants::InstrCost;
2196     }
2197   }
2198   // The call instruction also disappears after inlining.
2199   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
2200   return Cost;
2201 }
2202 
2203 InlineCost llvm::getInlineCost(
2204     CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2205     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2206     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2207     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2208     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2209   return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2210                        GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
2211 }
2212 
2213 InlineCost llvm::getInlineCost(
2214     CallBase &Call, Function *Callee, const InlineParams &Params,
2215     TargetTransformInfo &CalleeTTI,
2216     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2217     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2218     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2219     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2220 
2221   // Cannot inline indirect calls.
2222   if (!Callee)
2223     return llvm::InlineCost::getNever("indirect call");
2224 
  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle the fact
  // that the argument is now in the alloca address space (so it is a little
  // bit complicated to solve).
2230   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2231   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2232     if (Call.isByValArgument(I)) {
2233       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2234       if (PTy->getAddressSpace() != AllocaAS)
2235         return llvm::InlineCost::getNever("byval arguments without alloca"
2236                                           " address space");
2237     }
2238 
2239   // Calls to functions with always-inline attributes should be inlined
2240   // whenever possible.
2241   if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2242     auto IsViable = isInlineViable(*Callee);
2243     if (IsViable.isSuccess())
2244       return llvm::InlineCost::getAlways("always inline attribute");
2245     return llvm::InlineCost::getNever(IsViable.getFailureReason());
2246   }
2247 
  // Never inline functions with conflicting attributes (unless the callee has
  // the always-inline attribute).
2250   Function *Caller = Call.getCaller();
2251   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
2252     return llvm::InlineCost::getNever("conflicting attributes");
2253 
2254   // Don't inline this call if the caller has the optnone attribute.
2255   if (Caller->hasOptNone())
2256     return llvm::InlineCost::getNever("optnone attribute");
2257 
2258   // Don't inline a function that treats null pointer as valid into a caller
2259   // that does not have this attribute.
2260   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2261     return llvm::InlineCost::getNever("nullptr definitions incompatible");
2262 
2263   // Don't inline functions which can be interposed at link-time.
2264   if (Callee->isInterposable())
2265     return llvm::InlineCost::getNever("interposable");
2266 
2267   // Don't inline functions marked noinline.
2268   if (Callee->hasFnAttribute(Attribute::NoInline))
2269     return llvm::InlineCost::getNever("noinline function attribute");
2270 
2271   // Don't inline call sites marked noinline.
2272   if (Call.isNoInline())
2273     return llvm::InlineCost::getNever("noinline call site attribute");
2274 
2275   LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
2276                           << "... (caller:" << Caller->getName() << ")\n");
2277 
2278   InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
2279                             *Callee, Call, Params);
2280   InlineResult ShouldInline = CA.analyze();
2281 
2282   LLVM_DEBUG(CA.dump());
2283 
2284   // Check if there was a reason to force inlining or no inlining.
2285   if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
2286     return InlineCost::getNever(ShouldInline.getFailureReason());
2287   if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
2288     return InlineCost::getAlways("empty function");
2289 
2290   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2291 }
2292 
2293 InlineResult llvm::isInlineViable(Function &F) {
2294   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2295   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2296     // Disallow inlining of functions which contain indirect branches.
2297     if (isa<IndirectBrInst>(BI->getTerminator()))
2298       return InlineResult::failure("contains indirect branches");
2299 
2300     // Disallow inlining of blockaddresses which are used by non-callbr
2301     // instructions.
2302     if (BI->hasAddressTaken())
2303       for (User *U : BlockAddress::get(&*BI)->users())
2304         if (!isa<CallBrInst>(*U))
2305           return InlineResult::failure("blockaddress used outside of callbr");
2306 
2307     for (auto &II : *BI) {
2308       CallBase *Call = dyn_cast<CallBase>(&II);
2309       if (!Call)
2310         continue;
2311 
2312       // Disallow recursive calls.
2313       if (&F == Call->getCalledFunction())
2314         return InlineResult::failure("recursive call");
2315 
2316       // Disallow calls which expose returns-twice to a function not previously
2317       // attributed as such.
2318       if (!ReturnsTwice && isa<CallInst>(Call) &&
2319           cast<CallInst>(Call)->canReturnTwice())
2320         return InlineResult::failure("exposes returns-twice attribute");
2321 
2322       if (Call->getCalledFunction())
2323         switch (Call->getCalledFunction()->getIntrinsicID()) {
2324         default:
2325           break;
2326         case llvm::Intrinsic::icall_branch_funnel:
2327           // Disallow inlining of @llvm.icall.branch.funnel because current
2328           // backend can't separate call targets from call arguments.
2329           return InlineResult::failure(
2330               "disallowed inlining of @llvm.icall.branch.funnel");
2331         case llvm::Intrinsic::localescape:
2332           // Disallow inlining functions that call @llvm.localescape. Doing this
2333           // correctly would require major changes to the inliner.
2334           return InlineResult::failure(
2335               "disallowed inlining of @llvm.localescape");
2336         case llvm::Intrinsic::vastart:
2337           // Disallow inlining of functions that initialize VarArgs with
2338           // va_start.
2339           return InlineResult::failure(
2340               "contains VarArgs initialized with va_start");
2341         }
2342     }
2343   }
2344 
2345   return InlineResult::success();
2346 }
2347 
2348 // APIs to create InlineParams based on command line flags and/or other
2349 // parameters.
2350 
2351 InlineParams llvm::getInlineParams(int Threshold) {
2352   InlineParams Params;
2353 
2354   // This field is the threshold to use for a callee by default. This is
2355   // derived from one or more of:
2356   //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
2358   //  * the -inline-threshold flag.
2359   //  If the -inline-threshold flag is explicitly specified, that is used
2360   //  irrespective of anything else.
2361   if (InlineThreshold.getNumOccurrences() > 0)
2362     Params.DefaultThreshold = InlineThreshold;
2363   else
2364     Params.DefaultThreshold = Threshold;
2365 
2366   // Set the HintThreshold knob from the -inlinehint-threshold.
2367   Params.HintThreshold = HintThreshold;
2368 
2369   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2370   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2371 
2372   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2373   // populate LocallyHotCallSiteThreshold. Later, we populate
2374   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2375   // we know that optimization level is O3 (in the getInlineParams variant that
2376   // takes the opt and size levels).
2377   // FIXME: Remove this check (and make the assignment unconditional) after
2378   // addressing size regression issues at O2.
2379   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2380     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2381 
2382   // Set the ColdCallSiteThreshold knob from the
2383   // -inline-cold-callsite-threshold.
2384   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2385 
2386   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command line option is not explicitly given. If that
2388   // option is present, then its value applies even for callees with size and
2389   // minsize attributes.
2390   // If the -inline-threshold is not specified, set the ColdThreshold from the
2391   // -inlinecold-threshold even if it is not explicitly passed. If
2392   // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
2394   if (InlineThreshold.getNumOccurrences() == 0) {
2395     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2396     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2397     Params.ColdThreshold = ColdThreshold;
2398   } else if (ColdThreshold.getNumOccurrences() > 0) {
2399     Params.ColdThreshold = ColdThreshold;
2400   }
2401   return Params;
2402 }
2403 
2404 InlineParams llvm::getInlineParams() {
2405   return getInlineParams(DefaultThreshold);
2406 }
2407 
2408 // Compute the default threshold for inlining based on the opt level and the
2409 // size opt level.
2410 static int computeThresholdFromOptLevels(unsigned OptLevel,
2411                                          unsigned SizeOptLevel) {
2412   if (OptLevel > 2)
2413     return InlineConstants::OptAggressiveThreshold;
2414   if (SizeOptLevel == 1) // -Os
2415     return InlineConstants::OptSizeThreshold;
2416   if (SizeOptLevel == 2) // -Oz
2417     return InlineConstants::OptMinSizeThreshold;
2418   return DefaultThreshold;
2419 }
2420 
2421 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2422   auto Params =
2423       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2424   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2425   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2426   // when it is specified explicitly.
2427   if (OptLevel > 2)
2428     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2429   return Params;
2430 }
2431