1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements inline cost analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Analysis/InlineCost.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/CFG.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/InstructionSimplify.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Config/llvm-config.h"
31 #include "llvm/IR/AssemblyAnnotationWriter.h"
32 #include "llvm/IR/CallingConv.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/GetElementPtrTypeIterator.h"
36 #include "llvm/IR/GlobalAlias.h"
37 #include "llvm/IR/InstVisitor.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Operator.h"
40 #include "llvm/IR/PatternMatch.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/FormattedStream.h"
44 #include "llvm/Support/raw_ostream.h"
45 
46 using namespace llvm;
47 
48 #define DEBUG_TYPE "inline-cost"
49 
50 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
51 
52 static cl::opt<int>
53     DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
54                      cl::ZeroOrMore,
55                      cl::desc("Default amount of inlining to perform"));
56 
57 static cl::opt<bool> PrintDebugInstructionDeltas("print-instruction-deltas",
58     cl::Hidden, cl::init(false),
59     cl::desc("Prints deltas of cost and threshold per instruction"));
60 
61 static cl::opt<int> InlineThreshold(
62     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
63     cl::desc("Control the amount of inlining to perform (default = 225)"));
64 
65 static cl::opt<int> HintThreshold(
66     "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
67     cl::desc("Threshold for inlining functions with inline hint"));
68 
69 static cl::opt<int>
70     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
71                           cl::init(45), cl::ZeroOrMore,
72                           cl::desc("Threshold for inlining cold callsites"));
73 
74 // We introduce this threshold to help performance of instrumentation based
75 // PGO before we actually hook up inliner with analysis passes such as BPI and
76 // BFI.
77 static cl::opt<int> ColdThreshold(
78     "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
79     cl::desc("Threshold for inlining functions with cold attribute"));
80 
81 static cl::opt<int>
82     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
83                          cl::ZeroOrMore,
84                          cl::desc("Threshold for hot callsites "));
85 
86 static cl::opt<int> LocallyHotCallSiteThreshold(
87     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
88     cl::desc("Threshold for locally hot callsites "));
89 
90 static cl::opt<int> ColdCallSiteRelFreq(
91     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
92     cl::desc("Maximum block frequency, expressed as a percentage of caller's "
93              "entry frequency, for a callsite to be cold in the absence of "
94              "profile information."));
95 
96 static cl::opt<int> HotCallSiteRelFreq(
97     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
98     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
99              "entry frequency, for a callsite to be hot in the absence of "
100              "profile information."));
101 
102 static cl::opt<bool> OptComputeFullInlineCost(
103     "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
104     cl::desc("Compute the full inline cost of a call site even when the cost "
105              "exceeds the threshold."));
106 
107 static cl::opt<bool> InlineCallerSupersetNoBuiltin(
108     "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
109     cl::ZeroOrMore,
110     cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
111              "attributes."));
112 
113 namespace {
114 class InlineCostCallAnalyzer;
115 
116 // This struct is used to store information about inline cost of a
117 // particular instruction
118 struct InstructionCostDetail {
119   int CostBefore = 0;
120   int CostAfter = 0;
121   int ThresholdBefore = 0;
122   int ThresholdAfter = 0;
123 
124   int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
125 
126   int getCostDelta() const { return CostAfter - CostBefore; }
127 
128   bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
129 };
130 
131 class CostAnnotationWriter : public AssemblyAnnotationWriter {
132 public:
133   // This DenseMap stores the delta change in cost and threshold after
134   // accounting for the given instruction.
135   DenseMap <const Instruction *, InstructionCostDetail> CostThresholdMap;
136 
137   virtual void emitInstructionAnnot(const Instruction *I,
138                                         formatted_raw_ostream &OS);
139 };
140 
141 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
142   typedef InstVisitor<CallAnalyzer, bool> Base;
143   friend class InstVisitor<CallAnalyzer, bool>;
144 
145 protected:
146   virtual ~CallAnalyzer() {}
147   /// The TargetTransformInfo available for this compilation.
148   const TargetTransformInfo &TTI;
149 
150   /// Getter for the cache of @llvm.assume intrinsics.
151   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
152 
153   /// Getter for BlockFrequencyInfo
154   Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
155 
156   /// Profile summary information.
157   ProfileSummaryInfo *PSI;
158 
159   /// The called function.
160   Function &F;
161 
162   // Cache the DataLayout since we use it a lot.
163   const DataLayout &DL;
164 
165   /// The OptimizationRemarkEmitter available for this compilation.
166   OptimizationRemarkEmitter *ORE;
167 
168   /// The candidate callsite being analyzed. Please do not use this to do
169   /// analysis in the caller function; we want the inline cost query to be
170   /// easily cacheable. Instead, use the cover function paramHasAttr.
171   CallBase &CandidateCall;
172 
173   /// Extension points for handling callsite features.
174   /// Called after a basic block was analyzed.
175   virtual void onBlockAnalyzed(const BasicBlock *BB) {}
176 
177   /// Called before an instruction was analyzed
178   virtual void onInstructionAnalysisStart(const Instruction *I) {}
179 
180   /// Called after an instruction was analyzed
181   virtual void onInstructionAnalysisFinish(const Instruction *I) {}
182 
183   /// Called at the end of the analysis of the callsite. Return the outcome of
184   /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
185   /// the reason it can't.
186   virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
187   /// Called when we're about to start processing a basic block, and every time
188   /// we are done processing an instruction. Return true if there is no point in
189   /// continuing the analysis (e.g. we've determined already the call site is
190   /// too expensive to inline)
191   virtual bool shouldStop() { return false; }
192 
193   /// Called before the analysis of the callee body starts (with callsite
194   /// contexts propagated).  It checks callsite-specific information. Return a
195   /// reason analysis can't continue if that's the case, or 'true' if it may
196   /// continue.
197   virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
198   /// Called if the analysis engine decides SROA cannot be done for the given
199   /// alloca.
200   virtual void onDisableSROA(AllocaInst *Arg) {}
201 
202   /// Called the analysis engine determines load elimination won't happen.
203   virtual void onDisableLoadElimination() {}
204 
205   /// Called to account for a call.
206   virtual void onCallPenalty() {}
207 
208   /// Called to account for the expectation the inlining would result in a load
209   /// elimination.
210   virtual void onLoadEliminationOpportunity() {}
211 
212   /// Called to account for the cost of argument setup for the Call in the
213   /// callee's body (not the callsite currently under analysis).
214   virtual void onCallArgumentSetup(const CallBase &Call) {}
215 
216   /// Called to account for a load relative intrinsic.
217   virtual void onLoadRelativeIntrinsic() {}
218 
219   /// Called to account for a lowered call.
220   virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
221   }
222 
223   /// Account for a jump table of given size. Return false to stop further
224   /// processing the switch instruction
225   virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
226 
227   /// Account for a case cluster of given size. Return false to stop further
228   /// processing of the instruction.
229   virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
230 
231   /// Called at the end of processing a switch instruction, with the given
232   /// number of case clusters.
233   virtual void onFinalizeSwitch(unsigned JumpTableSize,
234                                 unsigned NumCaseCluster) {}
235 
236   /// Called to account for any other instruction not specifically accounted
237   /// for.
238   virtual void onMissedSimplification() {}
239 
240   /// Start accounting potential benefits due to SROA for the given alloca.
241   virtual void onInitializeSROAArg(AllocaInst *Arg) {}
242 
243   /// Account SROA savings for the AllocaInst value.
244   virtual void onAggregateSROAUse(AllocaInst *V) {}
245 
246   bool handleSROA(Value *V, bool DoNotDisable) {
247     // Check for SROA candidates in comparisons.
248     if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
249       if (DoNotDisable) {
250         onAggregateSROAUse(SROAArg);
251         return true;
252       }
253       disableSROAForArg(SROAArg);
254     }
255     return false;
256   }
257 
258   bool IsCallerRecursive = false;
259   bool IsRecursiveCall = false;
260   bool ExposesReturnsTwice = false;
261   bool HasDynamicAlloca = false;
262   bool ContainsNoDuplicateCall = false;
263   bool HasReturn = false;
264   bool HasIndirectBr = false;
265   bool HasUninlineableIntrinsic = false;
266   bool InitsVargArgs = false;
267 
268   /// Number of bytes allocated statically by the callee.
269   uint64_t AllocatedSize = 0;
270   unsigned NumInstructions = 0;
271   unsigned NumVectorInstructions = 0;
272 
273   /// While we walk the potentially-inlined instructions, we build up and
274   /// maintain a mapping of simplified values specific to this callsite. The
275   /// idea is to propagate any special information we have about arguments to
276   /// this call through the inlinable section of the function, and account for
277   /// likely simplifications post-inlining. The most important aspect we track
278   /// is CFG altering simplifications -- when we prove a basic block dead, that
279   /// can cause dramatic shifts in the cost of inlining a function.
280   DenseMap<Value *, Constant *> SimplifiedValues;
281 
282   /// Keep track of the values which map back (through function arguments) to
283   /// allocas on the caller stack which could be simplified through SROA.
284   DenseMap<Value *, AllocaInst *> SROAArgValues;
285 
286   /// Keep track of Allocas for which we believe we may get SROA optimization.
287   DenseSet<AllocaInst *> EnabledSROAAllocas;
288 
289   /// Keep track of values which map to a pointer base and constant offset.
290   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
291 
292   /// Keep track of dead blocks due to the constant arguments.
293   SetVector<BasicBlock *> DeadBlocks;
294 
295   /// The mapping of the blocks to their known unique successors due to the
296   /// constant arguments.
297   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
298 
299   /// Model the elimination of repeated loads that is expected to happen
300   /// whenever we simplify away the stores that would otherwise cause them to be
301   /// loads.
302   bool EnableLoadElimination;
303   SmallPtrSet<Value *, 16> LoadAddrSet;
304 
305   AllocaInst *getSROAArgForValueOrNull(Value *V) const {
306     auto It = SROAArgValues.find(V);
307     if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
308       return nullptr;
309     return It->second;
310   }
311 
312   // Custom simplification helper routines.
313   bool isAllocaDerivedArg(Value *V);
314   void disableSROAForArg(AllocaInst *SROAArg);
315   void disableSROA(Value *V);
316   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
317   void disableLoadElimination();
318   bool isGEPFree(GetElementPtrInst &GEP);
319   bool canFoldInboundsGEP(GetElementPtrInst &I);
320   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
321   bool simplifyCallSite(Function *F, CallBase &Call);
322   template <typename Callable>
323   bool simplifyInstruction(Instruction &I, Callable Evaluate);
324   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
325 
326   /// Return true if the given argument to the function being considered for
327   /// inlining has the given attribute set either at the call site or the
328   /// function declaration.  Primarily used to inspect call site specific
329   /// attributes since these can be more precise than the ones on the callee
330   /// itself.
331   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
332 
333   /// Return true if the given value is known non null within the callee if
334   /// inlined through this particular callsite.
335   bool isKnownNonNullInCallee(Value *V);
336 
337   /// Return true if size growth is allowed when inlining the callee at \p Call.
338   bool allowSizeGrowth(CallBase &Call);
339 
340   // Custom analysis routines.
341   InlineResult analyzeBlock(BasicBlock *BB,
342                             SmallPtrSetImpl<const Value *> &EphValues);
343 
344   // Disable several entry points to the visitor so we don't accidentally use
345   // them by declaring but not defining them here.
346   void visit(Module *);
347   void visit(Module &);
348   void visit(Function *);
349   void visit(Function &);
350   void visit(BasicBlock *);
351   void visit(BasicBlock &);
352 
353   // Provide base case for our instruction visit.
354   bool visitInstruction(Instruction &I);
355 
356   // Our visit overrides.
357   bool visitAlloca(AllocaInst &I);
358   bool visitPHI(PHINode &I);
359   bool visitGetElementPtr(GetElementPtrInst &I);
360   bool visitBitCast(BitCastInst &I);
361   bool visitPtrToInt(PtrToIntInst &I);
362   bool visitIntToPtr(IntToPtrInst &I);
363   bool visitCastInst(CastInst &I);
364   bool visitUnaryInstruction(UnaryInstruction &I);
365   bool visitCmpInst(CmpInst &I);
366   bool visitSub(BinaryOperator &I);
367   bool visitBinaryOperator(BinaryOperator &I);
368   bool visitFNeg(UnaryOperator &I);
369   bool visitLoad(LoadInst &I);
370   bool visitStore(StoreInst &I);
371   bool visitExtractValue(ExtractValueInst &I);
372   bool visitInsertValue(InsertValueInst &I);
373   bool visitCallBase(CallBase &Call);
374   bool visitReturnInst(ReturnInst &RI);
375   bool visitBranchInst(BranchInst &BI);
376   bool visitSelectInst(SelectInst &SI);
377   bool visitSwitchInst(SwitchInst &SI);
378   bool visitIndirectBrInst(IndirectBrInst &IBI);
379   bool visitResumeInst(ResumeInst &RI);
380   bool visitCleanupReturnInst(CleanupReturnInst &RI);
381   bool visitCatchReturnInst(CatchReturnInst &RI);
382   bool visitUnreachableInst(UnreachableInst &I);
383 
384 public:
385   CallAnalyzer(const TargetTransformInfo &TTI,
386                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
387                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
388                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
389                Function &Callee, CallBase &Call)
390       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
391         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
392         CandidateCall(Call), EnableLoadElimination(true) {}
393 
394   InlineResult analyze();
395 
396   // Keep a bunch of stats about the cost savings found so we can print them
397   // out when debugging.
398   unsigned NumConstantArgs = 0;
399   unsigned NumConstantOffsetPtrArgs = 0;
400   unsigned NumAllocaArgs = 0;
401   unsigned NumConstantPtrCmps = 0;
402   unsigned NumConstantPtrDiffs = 0;
403   unsigned NumInstructionsSimplified = 0;
404 
405   void dump();
406 };
407 
408 /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
409 /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
410 class InlineCostCallAnalyzer final : public CallAnalyzer {
411   const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
412   const bool ComputeFullInlineCost;
413   int LoadEliminationCost = 0;
414   /// Bonus to be applied when percentage of vector instructions in callee is
415   /// high (see more details in updateThreshold).
416   int VectorBonus = 0;
417   /// Bonus to be applied when the callee has only one reachable basic block.
418   int SingleBBBonus = 0;
419 
420   /// Tunable parameters that control the analysis.
421   const InlineParams &Params;
422 
423   /// Upper bound for the inlining cost. Bonuses are being applied to account
424   /// for speculative "expected profit" of the inlining decision.
425   int Threshold = 0;
426 
427   /// Attempt to evaluate indirect calls to boost its inline cost.
428   const bool BoostIndirectCalls;
429 
430   /// Ignore the threshold when finalizing analysis.
431   const bool IgnoreThreshold;
432 
433   /// Inlining cost measured in abstract units, accounts for all the
434   /// instructions expected to be executed for a given function invocation.
435   /// Instructions that are statically proven to be dead based on call-site
436   /// arguments are not counted here.
437   int Cost = 0;
438 
439   bool SingleBB = true;
440 
441   unsigned SROACostSavings = 0;
442   unsigned SROACostSavingsLost = 0;
443 
444   /// The mapping of caller Alloca values to their accumulated cost savings. If
445   /// we have to disable SROA for one of the allocas, this tells us how much
446   /// cost must be added.
447   DenseMap<AllocaInst *, int> SROAArgCosts;
448 
449   /// Return true if \p Call is a cold callsite.
450   bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
451 
452   /// Update Threshold based on callsite properties such as callee
453   /// attributes and callee hotness for PGO builds. The Callee is explicitly
454   /// passed to support analyzing indirect calls whose target is inferred by
455   /// analysis.
456   void updateThreshold(CallBase &Call, Function &Callee);
457   /// Return a higher threshold if \p Call is a hot callsite.
458   Optional<int> getHotCallSiteThreshold(CallBase &Call,
459                                         BlockFrequencyInfo *CallerBFI);
460 
461   /// Handle a capped 'int' increment for Cost.
462   void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
463     assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
464     Cost = (int)std::min(UpperBound, Cost + Inc);
465   }
466 
467   void onDisableSROA(AllocaInst *Arg) override {
468     auto CostIt = SROAArgCosts.find(Arg);
469     if (CostIt == SROAArgCosts.end())
470       return;
471     addCost(CostIt->second);
472     SROACostSavings -= CostIt->second;
473     SROACostSavingsLost += CostIt->second;
474     SROAArgCosts.erase(CostIt);
475   }
476 
477   void onDisableLoadElimination() override {
478     addCost(LoadEliminationCost);
479     LoadEliminationCost = 0;
480   }
481   void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
482   void onCallArgumentSetup(const CallBase &Call) override {
483     // Pay the price of the argument setup. We account for the average 1
484     // instruction per call argument setup here.
485     addCost(Call.arg_size() * InlineConstants::InstrCost);
486   }
487   void onLoadRelativeIntrinsic() override {
488     // This is normally lowered to 4 LLVM instructions.
489     addCost(3 * InlineConstants::InstrCost);
490   }
491   void onLoweredCall(Function *F, CallBase &Call,
492                      bool IsIndirectCall) override {
493     // We account for the average 1 instruction per call argument setup here.
494     addCost(Call.arg_size() * InlineConstants::InstrCost);
495 
496     // If we have a constant that we are calling as a function, we can peer
497     // through it and see the function target. This happens not infrequently
498     // during devirtualization and so we want to give it a hefty bonus for
499     // inlining, but cap that bonus in the event that inlining wouldn't pan out.
500     // Pretend to inline the function, with a custom threshold.
501     if (IsIndirectCall && BoostIndirectCalls) {
502       auto IndirectCallParams = Params;
503       IndirectCallParams.DefaultThreshold =
504           InlineConstants::IndirectCallThreshold;
505       /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
506       /// to instantiate the derived class.
507       InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
508                                 Call, IndirectCallParams, false);
509       if (CA.analyze().isSuccess()) {
510         // We were able to inline the indirect call! Subtract the cost from the
511         // threshold to get the bonus we want to apply, but don't go below zero.
512         Cost -= std::max(0, CA.getThreshold() - CA.getCost());
513       }
514     } else
515       // Otherwise simply add the cost for merely making the call.
516       addCost(InlineConstants::CallPenalty);
517   }
518 
519   void onFinalizeSwitch(unsigned JumpTableSize,
520                         unsigned NumCaseCluster) override {
521     // If suitable for a jump table, consider the cost for the table size and
522     // branch to destination.
523     // Maximum valid cost increased in this function.
524     if (JumpTableSize) {
525       int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
526                        4 * InlineConstants::InstrCost;
527 
528       addCost(JTCost, (int64_t)CostUpperBound);
529       return;
530     }
531     // Considering forming a binary search, we should find the number of nodes
532     // which is same as the number of comparisons when lowered. For a given
533     // number of clusters, n, we can define a recursive function, f(n), to find
534     // the number of nodes in the tree. The recursion is :
535     // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
536     // and f(n) = n, when n <= 3.
537     // This will lead a binary tree where the leaf should be either f(2) or f(3)
538     // when n > 3.  So, the number of comparisons from leaves should be n, while
539     // the number of non-leaf should be :
540     //   2^(log2(n) - 1) - 1
541     //   = 2^log2(n) * 2^-1 - 1
542     //   = n / 2 - 1.
543     // Considering comparisons from leaf and non-leaf nodes, we can estimate the
544     // number of comparisons in a simple closed form :
545     //   n + n / 2 - 1 = n * 3 / 2 - 1
546     if (NumCaseCluster <= 3) {
547       // Suppose a comparison includes one compare and one conditional branch.
548       addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
549       return;
550     }
551 
552     int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
553     int64_t SwitchCost =
554         ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
555 
556     addCost(SwitchCost, (int64_t)CostUpperBound);
557   }
558   void onMissedSimplification() override {
559     addCost(InlineConstants::InstrCost);
560   }
561 
562   void onInitializeSROAArg(AllocaInst *Arg) override {
563     assert(Arg != nullptr &&
564            "Should not initialize SROA costs for null value.");
565     SROAArgCosts[Arg] = 0;
566   }
567 
568   void onAggregateSROAUse(AllocaInst *SROAArg) override {
569     auto CostIt = SROAArgCosts.find(SROAArg);
570     assert(CostIt != SROAArgCosts.end() &&
571            "expected this argument to have a cost");
572     CostIt->second += InlineConstants::InstrCost;
573     SROACostSavings += InlineConstants::InstrCost;
574   }
575 
576   void onBlockAnalyzed(const BasicBlock *BB) override {
577     auto *TI = BB->getTerminator();
578     // If we had any successors at this point, than post-inlining is likely to
579     // have them as well. Note that we assume any basic blocks which existed
580     // due to branches or switches which folded above will also fold after
581     // inlining.
582     if (SingleBB && TI->getNumSuccessors() > 1) {
583       // Take off the bonus we applied to the threshold.
584       Threshold -= SingleBBBonus;
585       SingleBB = false;
586     }
587   }
588 
589   void onInstructionAnalysisStart(const Instruction *I) override {
590     // This function is called to store the initial cost of inlining before
591     // the given instruction was assessed.
592     if (!PrintDebugInstructionDeltas)
593         return ;
594     Writer.CostThresholdMap[I].CostBefore = Cost;
595     Writer.CostThresholdMap[I].ThresholdBefore = Threshold;
596   }
597 
598   void onInstructionAnalysisFinish(const Instruction *I) override {
599     // This function is called to find new values of cost and threshold after
600     // the instruction has been assessed.
601     if (!PrintDebugInstructionDeltas)
602         return ;
603     Writer.CostThresholdMap[I].CostAfter = Cost;
604     Writer.CostThresholdMap[I].ThresholdAfter = Threshold;
605   }
606 
607   InlineResult finalizeAnalysis() override {
608     // Loops generally act a lot like calls in that they act like barriers to
609     // movement, require a certain amount of setup, etc. So when optimising for
610     // size, we penalise any call sites that perform loops. We do this after all
611     // other costs here, so will likely only be dealing with relatively small
612     // functions (and hence DT and LI will hopefully be cheap).
613     auto *Caller = CandidateCall.getFunction();
614     if (Caller->hasMinSize()) {
615       DominatorTree DT(F);
616       LoopInfo LI(DT);
617       int NumLoops = 0;
618       for (Loop *L : LI) {
619         // Ignore loops that will not be executed
620         if (DeadBlocks.count(L->getHeader()))
621           continue;
622         NumLoops++;
623       }
624       addCost(NumLoops * InlineConstants::CallPenalty);
625     }
626 
627     // We applied the maximum possible vector bonus at the beginning. Now,
628     // subtract the excess bonus, if any, from the Threshold before
629     // comparing against Cost.
630     if (NumVectorInstructions <= NumInstructions / 10)
631       Threshold -= VectorBonus;
632     else if (NumVectorInstructions <= NumInstructions / 2)
633       Threshold -= VectorBonus / 2;
634 
635     if (IgnoreThreshold || Cost < std::max(1, Threshold))
636       return InlineResult::success();
637     return InlineResult::failure("Cost over threshold.");
638   }
639   bool shouldStop() override {
640     // Bail out the moment we cross the threshold. This means we'll under-count
641     // the cost, but only when undercounting doesn't matter.
642     return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost;
643   }
644 
645   void onLoadEliminationOpportunity() override {
646     LoadEliminationCost += InlineConstants::InstrCost;
647   }
648 
649   InlineResult onAnalysisStart() override {
650     // Perform some tweaks to the cost and threshold based on the direct
651     // callsite information.
652 
653     // We want to more aggressively inline vector-dense kernels, so up the
654     // threshold, and we'll lower it if the % of vector instructions gets too
655     // low. Note that these bonuses are some what arbitrary and evolved over
656     // time by accident as much as because they are principled bonuses.
657     //
658     // FIXME: It would be nice to remove all such bonuses. At least it would be
659     // nice to base the bonus values on something more scientific.
660     assert(NumInstructions == 0);
661     assert(NumVectorInstructions == 0);
662 
663     // Update the threshold based on callsite properties
664     updateThreshold(CandidateCall, F);
665 
666     // While Threshold depends on commandline options that can take negative
667     // values, we want to enforce the invariant that the computed threshold and
668     // bonuses are non-negative.
669     assert(Threshold >= 0);
670     assert(SingleBBBonus >= 0);
671     assert(VectorBonus >= 0);
672 
673     // Speculatively apply all possible bonuses to Threshold. If cost exceeds
674     // this Threshold any time, and cost cannot decrease, we can stop processing
675     // the rest of the function body.
676     Threshold += (SingleBBBonus + VectorBonus);
677 
678     // Give out bonuses for the callsite, as the instructions setting them up
679     // will be gone after inlining.
680     addCost(-getCallsiteCost(this->CandidateCall, DL));
681 
682     // If this function uses the coldcc calling convention, prefer not to inline
683     // it.
684     if (F.getCallingConv() == CallingConv::Cold)
685       Cost += InlineConstants::ColdccPenalty;
686 
687     // Check if we're done. This can happen due to bonuses and penalties.
688     if (Cost >= Threshold && !ComputeFullInlineCost)
689       return InlineResult::failure("high cost");
690 
691     return InlineResult::success();
692   }
693 
694 public:
695   InlineCostCallAnalyzer(
696       const TargetTransformInfo &TTI,
697       std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
698       Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
699       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
700       CallBase &Call, const InlineParams &Params, bool BoostIndirect = true,
701       bool IgnoreThreshold = false)
702       : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
703         ComputeFullInlineCost(OptComputeFullInlineCost ||
704                               Params.ComputeFullInlineCost || ORE),
705         Params(Params), Threshold(Params.DefaultThreshold),
706         BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold) {}
707 
708   /// Annotation Writer for cost annotation
709   CostAnnotationWriter Writer;
710 
711   void dump();
712 
713   virtual ~InlineCostCallAnalyzer() {}
714   int getThreshold() { return Threshold; }
715   int getCost() { return Cost; }
716 };
717 } // namespace
718 
719 /// Test whether the given value is an Alloca-derived function argument.
720 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
721   return SROAArgValues.count(V);
722 }
723 
724 void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
725   onDisableSROA(SROAArg);
726   EnabledSROAAllocas.erase(SROAArg);
727   disableLoadElimination();
728 }
729 
730 void CostAnnotationWriter::emitInstructionAnnot(
731     const Instruction *I, formatted_raw_ostream &OS) {
732     // The cost of inlining of the given instruction is printed always.
733     // The threshold delta is printed only when it is non-zero. It happens
734     // when we decided to give a bonus at a particular instruction.
735     assert(CostThresholdMap.count(I) > 0 &&
736            "Expected each instruction to have an instruction annotation");
737     const auto &Record = CostThresholdMap[I];
738     OS << "; cost before = " << Record.CostBefore
739        << ", cost after = " << Record.CostAfter
740        << ", threshold before = " << Record.ThresholdBefore
741        << ", threshold after = " << Record.ThresholdAfter << ", ";
742     OS << "cost delta = " << Record.getCostDelta();
743     if (Record.hasThresholdChanged())
744       OS << ", threshold delta = " << Record.getThresholdDelta();
745     OS << "\n";
746 }
747 
748 /// If 'V' maps to a SROA candidate, disable SROA for it.
749 void CallAnalyzer::disableSROA(Value *V) {
750   if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
751     disableSROAForArg(SROAArg);
752   }
753 }
754 
755 void CallAnalyzer::disableLoadElimination() {
756   if (EnableLoadElimination) {
757     onDisableLoadElimination();
758     EnableLoadElimination = false;
759   }
760 }
761 
762 /// Accumulate a constant GEP offset into an APInt if possible.
763 ///
764 /// Returns false if unable to compute the offset for any reason. Respects any
765 /// simplified values known during the analysis of this callsite.
766 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
767   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
768   assert(IntPtrWidth == Offset.getBitWidth());
769 
770   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
771        GTI != GTE; ++GTI) {
772     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
773     if (!OpC)
774       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
775         OpC = dyn_cast<ConstantInt>(SimpleOp);
776     if (!OpC)
777       return false;
778     if (OpC->isZero())
779       continue;
780 
781     // Handle a struct index, which adds its field offset to the pointer.
782     if (StructType *STy = GTI.getStructTypeOrNull()) {
783       unsigned ElementIdx = OpC->getZExtValue();
784       const StructLayout *SL = DL.getStructLayout(STy);
785       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
786       continue;
787     }
788 
789     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
790     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
791   }
792   return true;
793 }
794 
795 /// Use TTI to check whether a GEP is free.
796 ///
797 /// Respects any simplified values known during the analysis of this callsite.
798 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
799   SmallVector<Value *, 4> Operands;
800   Operands.push_back(GEP.getOperand(0));
801   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
802     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
803       Operands.push_back(SimpleOp);
804     else
805       Operands.push_back(*I);
806   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
807 }
808 
809 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
810   // Check whether inlining will turn a dynamic alloca into a static
811   // alloca and handle that case.
812   if (I.isArrayAllocation()) {
813     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
814     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
815       Type *Ty = I.getAllocatedType();
816       AllocatedSize = SaturatingMultiplyAdd(
817           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
818           AllocatedSize);
819       return Base::visitAlloca(I);
820     }
821   }
822 
823   // Accumulate the allocated size.
824   if (I.isStaticAlloca()) {
825     Type *Ty = I.getAllocatedType();
826     AllocatedSize =
827         SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(), AllocatedSize);
828   }
829 
830   // We will happily inline static alloca instructions.
831   if (I.isStaticAlloca())
832     return Base::visitAlloca(I);
833 
834   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
835   // a variety of reasons, and so we would like to not inline them into
836   // functions which don't currently have a dynamic alloca. This simply
837   // disables inlining altogether in the presence of a dynamic alloca.
838   HasDynamicAlloca = true;
839   return false;
840 }
841 
842 bool CallAnalyzer::visitPHI(PHINode &I) {
843   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
844   // though we don't want to propagate it's bonuses. The idea is to disable
845   // SROA if it *might* be used in an inappropriate manner.
846 
847   // Phi nodes are always zero-cost.
848   // FIXME: Pointer sizes may differ between different address spaces, so do we
849   // need to use correct address space in the call to getPointerSizeInBits here?
850   // Or could we skip the getPointerSizeInBits call completely? As far as I can
851   // see the ZeroOffset is used as a dummy value, so we can probably use any
852   // bit width for the ZeroOffset?
853   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
854   bool CheckSROA = I.getType()->isPointerTy();
855 
856   // Track the constant or pointer with constant offset we've seen so far.
857   Constant *FirstC = nullptr;
858   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
859   Value *FirstV = nullptr;
860 
861   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
862     BasicBlock *Pred = I.getIncomingBlock(i);
863     // If the incoming block is dead, skip the incoming block.
864     if (DeadBlocks.count(Pred))
865       continue;
866     // If the parent block of phi is not the known successor of the incoming
867     // block, skip the incoming block.
868     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
869     if (KnownSuccessor && KnownSuccessor != I.getParent())
870       continue;
871 
872     Value *V = I.getIncomingValue(i);
873     // If the incoming value is this phi itself, skip the incoming value.
874     if (&I == V)
875       continue;
876 
877     Constant *C = dyn_cast<Constant>(V);
878     if (!C)
879       C = SimplifiedValues.lookup(V);
880 
881     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
882     if (!C && CheckSROA)
883       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
884 
885     if (!C && !BaseAndOffset.first)
886       // The incoming value is neither a constant nor a pointer with constant
887       // offset, exit early.
888       return true;
889 
890     if (FirstC) {
891       if (FirstC == C)
892         // If we've seen a constant incoming value before and it is the same
893         // constant we see this time, continue checking the next incoming value.
894         continue;
895       // Otherwise early exit because we either see a different constant or saw
896       // a constant before but we have a pointer with constant offset this time.
897       return true;
898     }
899 
900     if (FirstV) {
901       // The same logic as above, but check pointer with constant offset here.
902       if (FirstBaseAndOffset == BaseAndOffset)
903         continue;
904       return true;
905     }
906 
907     if (C) {
908       // This is the 1st time we've seen a constant, record it.
909       FirstC = C;
910       continue;
911     }
912 
913     // The remaining case is that this is the 1st time we've seen a pointer with
914     // constant offset, record it.
915     FirstV = V;
916     FirstBaseAndOffset = BaseAndOffset;
917   }
918 
919   // Check if we can map phi to a constant.
920   if (FirstC) {
921     SimplifiedValues[&I] = FirstC;
922     return true;
923   }
924 
925   // Check if we can map phi to a pointer with constant offset.
926   if (FirstBaseAndOffset.first) {
927     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
928 
929     if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
930       SROAArgValues[&I] = SROAArg;
931   }
932 
933   return true;
934 }
935 
936 /// Check we can fold GEPs of constant-offset call site argument pointers.
937 /// This requires target data and inbounds GEPs.
938 ///
939 /// \return true if the specified GEP can be folded.
940 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
941   // Check if we have a base + offset for the pointer.
942   std::pair<Value *, APInt> BaseAndOffset =
943       ConstantOffsetPtrs.lookup(I.getPointerOperand());
944   if (!BaseAndOffset.first)
945     return false;
946 
947   // Check if the offset of this GEP is constant, and if so accumulate it
948   // into Offset.
949   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
950     return false;
951 
952   // Add the result as a new mapping to Base + Offset.
953   ConstantOffsetPtrs[&I] = BaseAndOffset;
954 
955   return true;
956 }
957 
958 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
959   auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
960 
961   // Lambda to check whether a GEP's indices are all constant.
962   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
963     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
964       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
965         return false;
966     return true;
967   };
968 
969   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
970     if (SROAArg)
971       SROAArgValues[&I] = SROAArg;
972 
973     // Constant GEPs are modeled as free.
974     return true;
975   }
976 
977   // Variable GEPs will require math and will disable SROA.
978   if (SROAArg)
979     disableSROAForArg(SROAArg);
980   return isGEPFree(I);
981 }
982 
983 /// Simplify \p I if its operands are constants and update SimplifiedValues.
984 /// \p Evaluate is a callable specific to instruction type that evaluates the
985 /// instruction when all the operands are constants.
986 template <typename Callable>
987 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
988   SmallVector<Constant *, 2> COps;
989   for (Value *Op : I.operands()) {
990     Constant *COp = dyn_cast<Constant>(Op);
991     if (!COp)
992       COp = SimplifiedValues.lookup(Op);
993     if (!COp)
994       return false;
995     COps.push_back(COp);
996   }
997   auto *C = Evaluate(COps);
998   if (!C)
999     return false;
1000   SimplifiedValues[&I] = C;
1001   return true;
1002 }
1003 
1004 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1005   // Propagate constants through bitcasts.
1006   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1007         return ConstantExpr::getBitCast(COps[0], I.getType());
1008       }))
1009     return true;
1010 
1011   // Track base/offsets through casts
1012   std::pair<Value *, APInt> BaseAndOffset =
1013       ConstantOffsetPtrs.lookup(I.getOperand(0));
1014   // Casts don't change the offset, just wrap it up.
1015   if (BaseAndOffset.first)
1016     ConstantOffsetPtrs[&I] = BaseAndOffset;
1017 
1018   // Also look for SROA candidates here.
1019   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1020     SROAArgValues[&I] = SROAArg;
1021 
1022   // Bitcasts are always zero cost.
1023   return true;
1024 }
1025 
1026 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1027   // Propagate constants through ptrtoint.
1028   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1029         return ConstantExpr::getPtrToInt(COps[0], I.getType());
1030       }))
1031     return true;
1032 
1033   // Track base/offset pairs when converted to a plain integer provided the
1034   // integer is large enough to represent the pointer.
1035   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1036   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1037   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
1038     std::pair<Value *, APInt> BaseAndOffset =
1039         ConstantOffsetPtrs.lookup(I.getOperand(0));
1040     if (BaseAndOffset.first)
1041       ConstantOffsetPtrs[&I] = BaseAndOffset;
1042   }
1043 
1044   // This is really weird. Technically, ptrtoint will disable SROA. However,
1045   // unless that ptrtoint is *used* somewhere in the live basic blocks after
1046   // inlining, it will be nuked, and SROA should proceed. All of the uses which
1047   // would block SROA would also block SROA if applied directly to a pointer,
1048   // and so we can just add the integer in here. The only places where SROA is
1049   // preserved either cannot fire on an integer, or won't in-and-of themselves
1050   // disable SROA (ext) w/o some later use that we would see and disable.
1051   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1052     SROAArgValues[&I] = SROAArg;
1053 
1054   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1055 }
1056 
1057 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1058   // Propagate constants through ptrtoint.
1059   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1060         return ConstantExpr::getIntToPtr(COps[0], I.getType());
1061       }))
1062     return true;
1063 
1064   // Track base/offset pairs when round-tripped through a pointer without
1065   // modifications provided the integer is not too large.
1066   Value *Op = I.getOperand(0);
1067   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1068   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1069     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1070     if (BaseAndOffset.first)
1071       ConstantOffsetPtrs[&I] = BaseAndOffset;
1072   }
1073 
1074   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1075   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1076     SROAArgValues[&I] = SROAArg;
1077 
1078   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1079 }
1080 
1081 bool CallAnalyzer::visitCastInst(CastInst &I) {
1082   // Propagate constants through casts.
1083   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1084         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
1085       }))
1086     return true;
1087 
1088   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
1089   disableSROA(I.getOperand(0));
1090 
1091   // If this is a floating-point cast, and the target says this operation
1092   // is expensive, this may eventually become a library call. Treat the cost
1093   // as such.
1094   switch (I.getOpcode()) {
1095   case Instruction::FPTrunc:
1096   case Instruction::FPExt:
1097   case Instruction::UIToFP:
1098   case Instruction::SIToFP:
1099   case Instruction::FPToUI:
1100   case Instruction::FPToSI:
1101     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1102       onCallPenalty();
1103     break;
1104   default:
1105     break;
1106   }
1107 
1108   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
1109 }
1110 
1111 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
1112   Value *Operand = I.getOperand(0);
1113   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1114         return ConstantFoldInstOperands(&I, COps[0], DL);
1115       }))
1116     return true;
1117 
1118   // Disable any SROA on the argument to arbitrary unary instructions.
1119   disableSROA(Operand);
1120 
1121   return false;
1122 }
1123 
1124 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1125   return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1126 }
1127 
1128 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1129   // Does the *call site* have the NonNull attribute set on an argument?  We
1130   // use the attribute on the call site to memoize any analysis done in the
1131   // caller. This will also trip if the callee function has a non-null
1132   // parameter attribute, but that's a less interesting case because hopefully
1133   // the callee would already have been simplified based on that.
1134   if (Argument *A = dyn_cast<Argument>(V))
1135     if (paramHasAttr(A, Attribute::NonNull))
1136       return true;
1137 
1138   // Is this an alloca in the caller?  This is distinct from the attribute case
1139   // above because attributes aren't updated within the inliner itself and we
1140   // always want to catch the alloca derived case.
1141   if (isAllocaDerivedArg(V))
1142     // We can actually predict the result of comparisons between an
1143     // alloca-derived value and null. Note that this fires regardless of
1144     // SROA firing.
1145     return true;
1146 
1147   return false;
1148 }
1149 
1150 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1151   // If the normal destination of the invoke or the parent block of the call
1152   // site is unreachable-terminated, there is little point in inlining this
1153   // unless there is literally zero cost.
1154   // FIXME: Note that it is possible that an unreachable-terminated block has a
1155   // hot entry. For example, in below scenario inlining hot_call_X() may be
1156   // beneficial :
1157   // main() {
1158   //   hot_call_1();
1159   //   ...
1160   //   hot_call_N()
1161   //   exit(0);
1162   // }
1163   // For now, we are not handling this corner case here as it is rare in real
1164   // code. In future, we should elaborate this based on BPI and BFI in more
1165   // general threshold adjusting heuristics in updateThreshold().
1166   if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1167     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1168       return false;
1169   } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1170     return false;
1171 
1172   return true;
1173 }
1174 
1175 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1176                                             BlockFrequencyInfo *CallerBFI) {
1177   // If global profile summary is available, then callsite's coldness is
1178   // determined based on that.
1179   if (PSI && PSI->hasProfileSummary())
1180     return PSI->isColdCallSite(Call, CallerBFI);
1181 
1182   // Otherwise we need BFI to be available.
1183   if (!CallerBFI)
1184     return false;
1185 
1186   // Determine if the callsite is cold relative to caller's entry. We could
1187   // potentially cache the computation of scaled entry frequency, but the added
1188   // complexity is not worth it unless this scaling shows up high in the
1189   // profiles.
1190   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1191   auto CallSiteBB = Call.getParent();
1192   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1193   auto CallerEntryFreq =
1194       CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1195   return CallSiteFreq < CallerEntryFreq * ColdProb;
1196 }
1197 
1198 Optional<int>
1199 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1200                                                 BlockFrequencyInfo *CallerBFI) {
1201 
1202   // If global profile summary is available, then callsite's hotness is
1203   // determined based on that.
1204   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1205     return Params.HotCallSiteThreshold;
1206 
1207   // Otherwise we need BFI to be available and to have a locally hot callsite
1208   // threshold.
1209   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1210     return None;
1211 
1212   // Determine if the callsite is hot relative to caller's entry. We could
1213   // potentially cache the computation of scaled entry frequency, but the added
1214   // complexity is not worth it unless this scaling shows up high in the
1215   // profiles.
1216   auto CallSiteBB = Call.getParent();
1217   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
1218   auto CallerEntryFreq = CallerBFI->getEntryFreq();
1219   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
1220     return Params.LocallyHotCallSiteThreshold;
1221 
1222   // Otherwise treat it normally.
1223   return None;
1224 }
1225 
1226 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1227   // If no size growth is allowed for this inlining, set Threshold to 0.
1228   if (!allowSizeGrowth(Call)) {
1229     Threshold = 0;
1230     return;
1231   }
1232 
1233   Function *Caller = Call.getCaller();
1234 
1235   // return min(A, B) if B is valid.
1236   auto MinIfValid = [](int A, Optional<int> B) {
1237     return B ? std::min(A, B.getValue()) : A;
1238   };
1239 
1240   // return max(A, B) if B is valid.
1241   auto MaxIfValid = [](int A, Optional<int> B) {
1242     return B ? std::max(A, B.getValue()) : A;
1243   };
1244 
1245   // Various bonus percentages. These are multiplied by Threshold to get the
1246   // bonus values.
1247   // SingleBBBonus: This bonus is applied if the callee has a single reachable
1248   // basic block at the given callsite context. This is speculatively applied
1249   // and withdrawn if more than one basic block is seen.
1250   //
1251   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
1252   // of the last call to a static function as inlining such functions is
1253   // guaranteed to reduce code size.
1254   //
1255   // These bonus percentages may be set to 0 based on properties of the caller
1256   // and the callsite.
1257   int SingleBBBonusPercent = 50;
1258   int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1259   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1260 
1261   // Lambda to set all the above bonus and bonus percentages to 0.
1262   auto DisallowAllBonuses = [&]() {
1263     SingleBBBonusPercent = 0;
1264     VectorBonusPercent = 0;
1265     LastCallToStaticBonus = 0;
1266   };
1267 
1268   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1269   // and reduce the threshold if the caller has the necessary attribute.
1270   if (Caller->hasMinSize()) {
1271     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1272     // For minsize, we want to disable the single BB bonus and the vector
1273     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1274     // a static function will, at the minimum, eliminate the parameter setup and
1275     // call/return instructions.
1276     SingleBBBonusPercent = 0;
1277     VectorBonusPercent = 0;
1278   } else if (Caller->hasOptSize())
1279     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1280 
1281   // Adjust the threshold based on inlinehint attribute and profile based
1282   // hotness information if the caller does not have MinSize attribute.
1283   if (!Caller->hasMinSize()) {
1284     if (Callee.hasFnAttribute(Attribute::InlineHint))
1285       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1286 
1287     // FIXME: After switching to the new passmanager, simplify the logic below
1288     // by checking only the callsite hotness/coldness as we will reliably
1289     // have local profile information.
1290     //
1291     // Callsite hotness and coldness can be determined if sample profile is
1292     // used (which adds hotness metadata to calls) or if caller's
1293     // BlockFrequencyInfo is available.
1294     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
1295     auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1296     if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1297       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1298       // FIXME: This should update the threshold only if it exceeds the
1299       // current threshold, but AutoFDO + ThinLTO currently relies on this
1300       // behavior to prevent inlining of hot callsites during ThinLTO
1301       // compile phase.
1302       Threshold = HotCallSiteThreshold.getValue();
1303     } else if (isColdCallSite(Call, CallerBFI)) {
1304       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1305       // Do not apply bonuses for a cold callsite including the
1306       // LastCallToStatic bonus. While this bonus might result in code size
1307       // reduction, it can cause the size of a non-cold caller to increase
1308       // preventing it from being inlined.
1309       DisallowAllBonuses();
1310       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1311     } else if (PSI) {
1312       // Use callee's global profile information only if we have no way of
1313       // determining this via callsite information.
1314       if (PSI->isFunctionEntryHot(&Callee)) {
1315         LLVM_DEBUG(dbgs() << "Hot callee.\n");
1316         // If callsite hotness can not be determined, we may still know
1317         // that the callee is hot and treat it as a weaker hint for threshold
1318         // increase.
1319         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1320       } else if (PSI->isFunctionEntryCold(&Callee)) {
1321         LLVM_DEBUG(dbgs() << "Cold callee.\n");
1322         // Do not apply bonuses for a cold callee including the
1323         // LastCallToStatic bonus. While this bonus might result in code size
1324         // reduction, it can cause the size of a non-cold caller to increase
1325         // preventing it from being inlined.
1326         DisallowAllBonuses();
1327         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
1328       }
1329     }
1330   }
1331 
1332   // Finally, take the target-specific inlining threshold multiplier into
1333   // account.
1334   Threshold *= TTI.getInliningThresholdMultiplier();
1335 
1336   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1337   VectorBonus = Threshold * VectorBonusPercent / 100;
1338 
1339   bool OnlyOneCallAndLocalLinkage =
1340       F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
1341   // If there is only one call of the function, and it has internal linkage,
1342   // the cost of inlining it drops dramatically. It may seem odd to update
1343   // Cost in updateThreshold, but the bonus depends on the logic in this method.
1344   if (OnlyOneCallAndLocalLinkage)
1345     Cost -= LastCallToStaticBonus;
1346 }
1347 
1348 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1349   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1350   // First try to handle simplified comparisons.
1351   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1352         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
1353       }))
1354     return true;
1355 
1356   if (I.getOpcode() == Instruction::FCmp)
1357     return false;
1358 
1359   // Otherwise look for a comparison between constant offset pointers with
1360   // a common base.
1361   Value *LHSBase, *RHSBase;
1362   APInt LHSOffset, RHSOffset;
1363   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1364   if (LHSBase) {
1365     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1366     if (RHSBase && LHSBase == RHSBase) {
1367       // We have common bases, fold the icmp to a constant based on the
1368       // offsets.
1369       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1370       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1371       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1372         SimplifiedValues[&I] = C;
1373         ++NumConstantPtrCmps;
1374         return true;
1375       }
1376     }
1377   }
1378 
1379   // If the comparison is an equality comparison with null, we can simplify it
1380   // if we know the value (argument) can't be null
1381   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1382       isKnownNonNullInCallee(I.getOperand(0))) {
1383     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1384     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1385                                       : ConstantInt::getFalse(I.getType());
1386     return true;
1387   }
1388   return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
1389 }
1390 
1391 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1392   // Try to handle a special case: we can fold computing the difference of two
1393   // constant-related pointers.
1394   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1395   Value *LHSBase, *RHSBase;
1396   APInt LHSOffset, RHSOffset;
1397   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1398   if (LHSBase) {
1399     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1400     if (RHSBase && LHSBase == RHSBase) {
1401       // We have common bases, fold the subtract to a constant based on the
1402       // offsets.
1403       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1404       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1405       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1406         SimplifiedValues[&I] = C;
1407         ++NumConstantPtrDiffs;
1408         return true;
1409       }
1410     }
1411   }
1412 
1413   // Otherwise, fall back to the generic logic for simplifying and handling
1414   // instructions.
1415   return Base::visitSub(I);
1416 }
1417 
1418 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1419   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1420   Constant *CLHS = dyn_cast<Constant>(LHS);
1421   if (!CLHS)
1422     CLHS = SimplifiedValues.lookup(LHS);
1423   Constant *CRHS = dyn_cast<Constant>(RHS);
1424   if (!CRHS)
1425     CRHS = SimplifiedValues.lookup(RHS);
1426 
1427   Value *SimpleV = nullptr;
1428   if (auto FI = dyn_cast<FPMathOperator>(&I))
1429     SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
1430                             FI->getFastMathFlags(), DL);
1431   else
1432     SimpleV =
1433         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1434 
1435   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1436     SimplifiedValues[&I] = C;
1437 
1438   if (SimpleV)
1439     return true;
1440 
1441   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1442   disableSROA(LHS);
1443   disableSROA(RHS);
1444 
1445   // If the instruction is floating point, and the target says this operation
1446   // is expensive, this may eventually become a library call. Treat the cost
1447   // as such. Unless it's fneg which can be implemented with an xor.
1448   using namespace llvm::PatternMatch;
1449   if (I.getType()->isFloatingPointTy() &&
1450       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
1451       !match(&I, m_FNeg(m_Value())))
1452     onCallPenalty();
1453 
1454   return false;
1455 }
1456 
1457 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
1458   Value *Op = I.getOperand(0);
1459   Constant *COp = dyn_cast<Constant>(Op);
1460   if (!COp)
1461     COp = SimplifiedValues.lookup(Op);
1462 
1463   Value *SimpleV = SimplifyFNegInst(
1464       COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
1465 
1466   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1467     SimplifiedValues[&I] = C;
1468 
1469   if (SimpleV)
1470     return true;
1471 
1472   // Disable any SROA on arguments to arbitrary, unsimplified fneg.
1473   disableSROA(Op);
1474 
1475   return false;
1476 }
1477 
1478 bool CallAnalyzer::visitLoad(LoadInst &I) {
1479   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1480     return true;
1481 
1482   // If the data is already loaded from this address and hasn't been clobbered
1483   // by any stores or calls, this load is likely to be redundant and can be
1484   // eliminated.
1485   if (EnableLoadElimination &&
1486       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1487     onLoadEliminationOpportunity();
1488     return true;
1489   }
1490 
1491   return false;
1492 }
1493 
1494 bool CallAnalyzer::visitStore(StoreInst &I) {
1495   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1496     return true;
1497 
1498   // The store can potentially clobber loads and prevent repeated loads from
1499   // being eliminated.
1500   // FIXME:
1501   // 1. We can probably keep an initial set of eliminatable loads substracted
1502   // from the cost even when we finally see a store. We just need to disable
1503   // *further* accumulation of elimination savings.
1504   // 2. We should probably at some point thread MemorySSA for the callee into
1505   // this and then use that to actually compute *really* precise savings.
1506   disableLoadElimination();
1507   return false;
1508 }
1509 
1510 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1511   // Constant folding for extract value is trivial.
1512   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1513         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1514       }))
1515     return true;
1516 
1517   // SROA can look through these but give them a cost.
1518   return false;
1519 }
1520 
1521 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1522   // Constant folding for insert value is trivial.
1523   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1524         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1525                                             /*InsertedValueOperand*/ COps[1],
1526                                             I.getIndices());
1527       }))
1528     return true;
1529 
1530   // SROA can look through these but give them a cost.
1531   return false;
1532 }
1533 
1534 /// Try to simplify a call site.
1535 ///
1536 /// Takes a concrete function and callsite and tries to actually simplify it by
1537 /// analyzing the arguments and call itself with instsimplify. Returns true if
1538 /// it has simplified the callsite to some other entity (a constant), making it
1539 /// free.
1540 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
1541   // FIXME: Using the instsimplify logic directly for this is inefficient
1542   // because we have to continually rebuild the argument list even when no
1543   // simplifications can be performed. Until that is fixed with remapping
1544   // inside of instsimplify, directly constant fold calls here.
1545   if (!canConstantFoldCallTo(&Call, F))
1546     return false;
1547 
1548   // Try to re-map the arguments to constants.
1549   SmallVector<Constant *, 4> ConstantArgs;
1550   ConstantArgs.reserve(Call.arg_size());
1551   for (Value *I : Call.args()) {
1552     Constant *C = dyn_cast<Constant>(I);
1553     if (!C)
1554       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
1555     if (!C)
1556       return false; // This argument doesn't map to a constant.
1557 
1558     ConstantArgs.push_back(C);
1559   }
1560   if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
1561     SimplifiedValues[&Call] = C;
1562     return true;
1563   }
1564 
1565   return false;
1566 }
1567 
1568 bool CallAnalyzer::visitCallBase(CallBase &Call) {
1569   if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
1570       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1571     // This aborts the entire analysis.
1572     ExposesReturnsTwice = true;
1573     return false;
1574   }
1575   if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
1576     ContainsNoDuplicateCall = true;
1577 
1578   Value *Callee = Call.getCalledOperand();
1579   Function *F = dyn_cast_or_null<Function>(Callee);
1580   bool IsIndirectCall = !F;
1581   if (IsIndirectCall) {
1582     // Check if this happens to be an indirect function call to a known function
1583     // in this inline context. If not, we've done all we can.
1584     F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1585     if (!F) {
1586       onCallArgumentSetup(Call);
1587 
1588       if (!Call.onlyReadsMemory())
1589         disableLoadElimination();
1590       return Base::visitCallBase(Call);
1591     }
1592   }
1593 
1594   assert(F && "Expected a call to a known function");
1595 
1596   // When we have a concrete function, first try to simplify it directly.
1597   if (simplifyCallSite(F, Call))
1598     return true;
1599 
1600   // Next check if it is an intrinsic we know about.
1601   // FIXME: Lift this into part of the InstVisitor.
1602   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
1603     switch (II->getIntrinsicID()) {
1604     default:
1605       if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1606         disableLoadElimination();
1607       return Base::visitCallBase(Call);
1608 
1609     case Intrinsic::load_relative:
1610       onLoadRelativeIntrinsic();
1611       return false;
1612 
1613     case Intrinsic::memset:
1614     case Intrinsic::memcpy:
1615     case Intrinsic::memmove:
1616       disableLoadElimination();
1617       // SROA can usually chew through these intrinsics, but they aren't free.
1618       return false;
1619     case Intrinsic::icall_branch_funnel:
1620     case Intrinsic::localescape:
1621       HasUninlineableIntrinsic = true;
1622       return false;
1623     case Intrinsic::vastart:
1624       InitsVargArgs = true;
1625       return false;
1626     }
1627   }
1628 
1629   if (F == Call.getFunction()) {
1630     // This flag will fully abort the analysis, so don't bother with anything
1631     // else.
1632     IsRecursiveCall = true;
1633     return false;
1634   }
1635 
1636   if (TTI.isLoweredToCall(F)) {
1637     onLoweredCall(F, Call, IsIndirectCall);
1638   }
1639 
1640   if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
1641     disableLoadElimination();
1642   return Base::visitCallBase(Call);
1643 }
1644 
1645 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1646   // At least one return instruction will be free after inlining.
1647   bool Free = !HasReturn;
1648   HasReturn = true;
1649   return Free;
1650 }
1651 
1652 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1653   // We model unconditional branches as essentially free -- they really
1654   // shouldn't exist at all, but handling them makes the behavior of the
1655   // inliner more regular and predictable. Interestingly, conditional branches
1656   // which will fold away are also free.
1657   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1658          dyn_cast_or_null<ConstantInt>(
1659              SimplifiedValues.lookup(BI.getCondition()));
1660 }
1661 
1662 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1663   bool CheckSROA = SI.getType()->isPointerTy();
1664   Value *TrueVal = SI.getTrueValue();
1665   Value *FalseVal = SI.getFalseValue();
1666 
1667   Constant *TrueC = dyn_cast<Constant>(TrueVal);
1668   if (!TrueC)
1669     TrueC = SimplifiedValues.lookup(TrueVal);
1670   Constant *FalseC = dyn_cast<Constant>(FalseVal);
1671   if (!FalseC)
1672     FalseC = SimplifiedValues.lookup(FalseVal);
1673   Constant *CondC =
1674       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1675 
1676   if (!CondC) {
1677     // Select C, X, X => X
1678     if (TrueC == FalseC && TrueC) {
1679       SimplifiedValues[&SI] = TrueC;
1680       return true;
1681     }
1682 
1683     if (!CheckSROA)
1684       return Base::visitSelectInst(SI);
1685 
1686     std::pair<Value *, APInt> TrueBaseAndOffset =
1687         ConstantOffsetPtrs.lookup(TrueVal);
1688     std::pair<Value *, APInt> FalseBaseAndOffset =
1689         ConstantOffsetPtrs.lookup(FalseVal);
1690     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1691       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1692 
1693       if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
1694         SROAArgValues[&SI] = SROAArg;
1695       return true;
1696     }
1697 
1698     return Base::visitSelectInst(SI);
1699   }
1700 
1701   // Select condition is a constant.
1702   Value *SelectedV = CondC->isAllOnesValue()
1703                          ? TrueVal
1704                          : (CondC->isNullValue()) ? FalseVal : nullptr;
1705   if (!SelectedV) {
1706     // Condition is a vector constant that is not all 1s or all 0s.  If all
1707     // operands are constants, ConstantExpr::getSelect() can handle the cases
1708     // such as select vectors.
1709     if (TrueC && FalseC) {
1710       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1711         SimplifiedValues[&SI] = C;
1712         return true;
1713       }
1714     }
1715     return Base::visitSelectInst(SI);
1716   }
1717 
1718   // Condition is either all 1s or all 0s. SI can be simplified.
1719   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1720     SimplifiedValues[&SI] = SelectedC;
1721     return true;
1722   }
1723 
1724   if (!CheckSROA)
1725     return true;
1726 
1727   std::pair<Value *, APInt> BaseAndOffset =
1728       ConstantOffsetPtrs.lookup(SelectedV);
1729   if (BaseAndOffset.first) {
1730     ConstantOffsetPtrs[&SI] = BaseAndOffset;
1731 
1732     if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
1733       SROAArgValues[&SI] = SROAArg;
1734   }
1735 
1736   return true;
1737 }
1738 
1739 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1740   // We model unconditional switches as free, see the comments on handling
1741   // branches.
1742   if (isa<ConstantInt>(SI.getCondition()))
1743     return true;
1744   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1745     if (isa<ConstantInt>(V))
1746       return true;
1747 
1748   // Assume the most general case where the switch is lowered into
1749   // either a jump table, bit test, or a balanced binary tree consisting of
1750   // case clusters without merging adjacent clusters with the same
1751   // destination. We do not consider the switches that are lowered with a mix
1752   // of jump table/bit test/binary search tree. The cost of the switch is
1753   // proportional to the size of the tree or the size of jump table range.
1754   //
1755   // NB: We convert large switches which are just used to initialize large phi
1756   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1757   // inlining those. It will prevent inlining in cases where the optimization
1758   // does not (yet) fire.
1759 
1760   unsigned JumpTableSize = 0;
1761   BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
1762   unsigned NumCaseCluster =
1763       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
1764 
1765   onFinalizeSwitch(JumpTableSize, NumCaseCluster);
1766   return false;
1767 }
1768 
1769 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1770   // We never want to inline functions that contain an indirectbr.  This is
1771   // incorrect because all the blockaddress's (in static global initializers
1772   // for example) would be referring to the original function, and this
1773   // indirect jump would jump from the inlined copy of the function into the
1774   // original function which is extremely undefined behavior.
1775   // FIXME: This logic isn't really right; we can safely inline functions with
1776   // indirectbr's as long as no other function or global references the
1777   // blockaddress of a block within the current function.
1778   HasIndirectBr = true;
1779   return false;
1780 }
1781 
1782 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1783   // FIXME: It's not clear that a single instruction is an accurate model for
1784   // the inline cost of a resume instruction.
1785   return false;
1786 }
1787 
1788 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1789   // FIXME: It's not clear that a single instruction is an accurate model for
1790   // the inline cost of a cleanupret instruction.
1791   return false;
1792 }
1793 
1794 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1795   // FIXME: It's not clear that a single instruction is an accurate model for
1796   // the inline cost of a catchret instruction.
1797   return false;
1798 }
1799 
1800 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1801   // FIXME: It might be reasonably to discount the cost of instructions leading
1802   // to unreachable as they have the lowest possible impact on both runtime and
1803   // code size.
1804   return true; // No actual code is needed for unreachable.
1805 }
1806 
1807 bool CallAnalyzer::visitInstruction(Instruction &I) {
1808   // Some instructions are free. All of the free intrinsics can also be
1809   // handled by SROA, etc.
1810   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1811     return true;
1812 
1813   // We found something we don't understand or can't handle. Mark any SROA-able
1814   // values in the operand list as no longer viable.
1815   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1816     disableSROA(*OI);
1817 
1818   return false;
1819 }
1820 
1821 /// Analyze a basic block for its contribution to the inline cost.
1822 ///
1823 /// This method walks the analyzer over every instruction in the given basic
1824 /// block and accounts for their cost during inlining at this callsite. It
1825 /// aborts early if the threshold has been exceeded or an impossible to inline
1826 /// construct has been detected. It returns false if inlining is no longer
1827 /// viable, and true if inlining remains viable.
1828 InlineResult
1829 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1830                            SmallPtrSetImpl<const Value *> &EphValues) {
1831   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1832     // FIXME: Currently, the number of instructions in a function regardless of
1833     // our ability to simplify them during inline to constants or dead code,
1834     // are actually used by the vector bonus heuristic. As long as that's true,
1835     // we have to special case debug intrinsics here to prevent differences in
1836     // inlining due to debug symbols. Eventually, the number of unsimplified
1837     // instructions shouldn't factor into the cost computation, but until then,
1838     // hack around it here.
1839     if (isa<DbgInfoIntrinsic>(I))
1840       continue;
1841 
1842     // Skip ephemeral values.
1843     if (EphValues.count(&*I))
1844       continue;
1845 
1846     ++NumInstructions;
1847     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1848       ++NumVectorInstructions;
1849 
1850     // If the instruction simplified to a constant, there is no cost to this
1851     // instruction. Visit the instructions using our InstVisitor to account for
1852     // all of the per-instruction logic. The visit tree returns true if we
1853     // consumed the instruction in any way, and false if the instruction's base
1854     // cost should count against inlining.
1855     onInstructionAnalysisStart(&*I);
1856 
1857     if (Base::visit(&*I))
1858       ++NumInstructionsSimplified;
1859     else
1860       onMissedSimplification();
1861 
1862     onInstructionAnalysisFinish(&*I);
1863     using namespace ore;
1864     // If the visit this instruction detected an uninlinable pattern, abort.
1865     InlineResult IR = InlineResult::success();
1866     if (IsRecursiveCall)
1867       IR = InlineResult::failure("recursive");
1868     else if (ExposesReturnsTwice)
1869       IR = InlineResult::failure("exposes returns twice");
1870     else if (HasDynamicAlloca)
1871       IR = InlineResult::failure("dynamic alloca");
1872     else if (HasIndirectBr)
1873       IR = InlineResult::failure("indirect branch");
1874     else if (HasUninlineableIntrinsic)
1875       IR = InlineResult::failure("uninlinable intrinsic");
1876     else if (InitsVargArgs)
1877       IR = InlineResult::failure("varargs");
1878     if (!IR.isSuccess()) {
1879       if (ORE)
1880         ORE->emit([&]() {
1881           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1882                                           &CandidateCall)
1883                  << NV("Callee", &F) << " has uninlinable pattern ("
1884                  << NV("InlineResult", IR.getFailureReason())
1885                  << ") and cost is not fully computed";
1886         });
1887       return IR;
1888     }
1889 
1890     // If the caller is a recursive function then we don't want to inline
1891     // functions which allocate a lot of stack space because it would increase
1892     // the caller stack usage dramatically.
1893     if (IsCallerRecursive &&
1894         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1895       auto IR =
1896           InlineResult::failure("recursive and allocates too much stack space");
1897       if (ORE)
1898         ORE->emit([&]() {
1899           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1900                                           &CandidateCall)
1901                  << NV("Callee", &F) << " is "
1902                  << NV("InlineResult", IR.getFailureReason())
1903                  << ". Cost is not fully computed";
1904         });
1905       return IR;
1906     }
1907 
1908     if (shouldStop())
1909       return InlineResult::failure(
1910           "Call site analysis is not favorable to inlining.");
1911   }
1912 
1913   return InlineResult::success();
1914 }
1915 
1916 /// Compute the base pointer and cumulative constant offsets for V.
1917 ///
1918 /// This strips all constant offsets off of V, leaving it the base pointer, and
1919 /// accumulates the total constant offset applied in the returned constant. It
1920 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1921 /// no constant offsets applied.
1922 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1923   if (!V->getType()->isPointerTy())
1924     return nullptr;
1925 
1926   unsigned AS = V->getType()->getPointerAddressSpace();
1927   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1928   APInt Offset = APInt::getNullValue(IntPtrWidth);
1929 
1930   // Even though we don't look through PHI nodes, we could be called on an
1931   // instruction in an unreachable block, which may be on a cycle.
1932   SmallPtrSet<Value *, 4> Visited;
1933   Visited.insert(V);
1934   do {
1935     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1936       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1937         return nullptr;
1938       V = GEP->getPointerOperand();
1939     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1940       V = cast<Operator>(V)->getOperand(0);
1941     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1942       if (GA->isInterposable())
1943         break;
1944       V = GA->getAliasee();
1945     } else {
1946       break;
1947     }
1948     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1949   } while (Visited.insert(V).second);
1950 
1951   Type *IdxPtrTy = DL.getIndexType(V->getType());
1952   return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
1953 }
1954 
1955 /// Find dead blocks due to deleted CFG edges during inlining.
1956 ///
1957 /// If we know the successor of the current block, \p CurrBB, has to be \p
1958 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1959 /// no live incoming CFG edges.  If one block is found to be dead, we can
1960 /// continue growing the dead block list by checking the successors of the dead
1961 /// blocks to see if all their incoming edges are dead or not.
1962 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1963   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1964     // A CFG edge is dead if the predecessor is dead or the predecessor has a
1965     // known successor which is not the one under exam.
1966     return (DeadBlocks.count(Pred) ||
1967             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1968   };
1969 
1970   auto IsNewlyDead = [&](BasicBlock *BB) {
1971     // If all the edges to a block are dead, the block is also dead.
1972     return (!DeadBlocks.count(BB) &&
1973             llvm::all_of(predecessors(BB),
1974                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1975   };
1976 
1977   for (BasicBlock *Succ : successors(CurrBB)) {
1978     if (Succ == NextBB || !IsNewlyDead(Succ))
1979       continue;
1980     SmallVector<BasicBlock *, 4> NewDead;
1981     NewDead.push_back(Succ);
1982     while (!NewDead.empty()) {
1983       BasicBlock *Dead = NewDead.pop_back_val();
1984       if (DeadBlocks.insert(Dead))
1985         // Continue growing the dead block lists.
1986         for (BasicBlock *S : successors(Dead))
1987           if (IsNewlyDead(S))
1988             NewDead.push_back(S);
1989     }
1990   }
1991 }
1992 
1993 /// Analyze a call site for potential inlining.
1994 ///
1995 /// Returns true if inlining this call is viable, and false if it is not
1996 /// viable. It computes the cost and adjusts the threshold based on numerous
1997 /// factors and heuristics. If this method returns false but the computed cost
1998 /// is below the computed threshold, then inlining was forcibly disabled by
1999 /// some artifact of the routine.
2000 InlineResult CallAnalyzer::analyze() {
2001   ++NumCallsAnalyzed;
2002 
2003   auto Result = onAnalysisStart();
2004   if (!Result.isSuccess())
2005     return Result;
2006 
2007   if (F.empty())
2008     return InlineResult::success();
2009 
2010   Function *Caller = CandidateCall.getFunction();
2011   // Check if the caller function is recursive itself.
2012   for (User *U : Caller->users()) {
2013     CallBase *Call = dyn_cast<CallBase>(U);
2014     if (Call && Call->getFunction() == Caller) {
2015       IsCallerRecursive = true;
2016       break;
2017     }
2018   }
2019 
2020   // Populate our simplified values by mapping from function arguments to call
2021   // arguments with known important simplifications.
2022   auto CAI = CandidateCall.arg_begin();
2023   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
2024        FAI != FAE; ++FAI, ++CAI) {
2025     assert(CAI != CandidateCall.arg_end());
2026     if (Constant *C = dyn_cast<Constant>(CAI))
2027       SimplifiedValues[&*FAI] = C;
2028 
2029     Value *PtrArg = *CAI;
2030     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2031       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
2032 
2033       // We can SROA any pointer arguments derived from alloca instructions.
2034       if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2035         SROAArgValues[&*FAI] = SROAArg;
2036         onInitializeSROAArg(SROAArg);
2037         EnabledSROAAllocas.insert(SROAArg);
2038       }
2039     }
2040   }
2041   NumConstantArgs = SimplifiedValues.size();
2042   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2043   NumAllocaArgs = SROAArgValues.size();
2044 
2045   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2046   // the ephemeral values multiple times (and they're completely determined by
2047   // the callee, so this is purely duplicate work).
2048   SmallPtrSet<const Value *, 32> EphValues;
2049   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2050 
2051   // The worklist of live basic blocks in the callee *after* inlining. We avoid
2052   // adding basic blocks of the callee which can be proven to be dead for this
2053   // particular call site in order to get more accurate cost estimates. This
2054   // requires a somewhat heavyweight iteration pattern: we need to walk the
2055   // basic blocks in a breadth-first order as we insert live successors. To
2056   // accomplish this, prioritizing for small iterations because we exit after
2057   // crossing our threshold, we use a small-size optimized SetVector.
2058   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
2059                     SmallPtrSet<BasicBlock *, 16>>
2060       BBSetVector;
2061   BBSetVector BBWorklist;
2062   BBWorklist.insert(&F.getEntryBlock());
2063 
2064   // Note that we *must not* cache the size, this loop grows the worklist.
2065   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2066     if (shouldStop())
2067       break;
2068 
2069     BasicBlock *BB = BBWorklist[Idx];
2070     if (BB->empty())
2071       continue;
2072 
2073     // Disallow inlining a blockaddress with uses other than strictly callbr.
2074     // A blockaddress only has defined behavior for an indirect branch in the
2075     // same function, and we do not currently support inlining indirect
2076     // branches.  But, the inliner may not see an indirect branch that ends up
2077     // being dead code at a particular call site. If the blockaddress escapes
2078     // the function, e.g., via a global variable, inlining may lead to an
2079     // invalid cross-function reference.
2080     // FIXME: pr/39560: continue relaxing this overt restriction.
2081     if (BB->hasAddressTaken())
2082       for (User *U : BlockAddress::get(&*BB)->users())
2083         if (!isa<CallBrInst>(*U))
2084           return InlineResult::failure("blockaddress used outside of callbr");
2085 
2086     // Analyze the cost of this block. If we blow through the threshold, this
2087     // returns false, and we can bail on out.
2088     InlineResult IR = analyzeBlock(BB, EphValues);
2089     if (!IR.isSuccess())
2090       return IR;
2091 
2092     Instruction *TI = BB->getTerminator();
2093 
2094     // Add in the live successors by first checking whether we have terminator
2095     // that may be simplified based on the values simplified by this call.
2096     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2097       if (BI->isConditional()) {
2098         Value *Cond = BI->getCondition();
2099         if (ConstantInt *SimpleCond =
2100                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2101           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2102           BBWorklist.insert(NextBB);
2103           KnownSuccessors[BB] = NextBB;
2104           findDeadBlocks(BB, NextBB);
2105           continue;
2106         }
2107       }
2108     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2109       Value *Cond = SI->getCondition();
2110       if (ConstantInt *SimpleCond =
2111               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2112         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2113         BBWorklist.insert(NextBB);
2114         KnownSuccessors[BB] = NextBB;
2115         findDeadBlocks(BB, NextBB);
2116         continue;
2117       }
2118     }
2119 
2120     // If we're unable to select a particular successor, just count all of
2121     // them.
2122     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2123          ++TIdx)
2124       BBWorklist.insert(TI->getSuccessor(TIdx));
2125 
2126     onBlockAnalyzed(BB);
2127   }
2128 
2129   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
2130                                     &F == CandidateCall.getCalledFunction();
2131   // If this is a noduplicate call, we can still inline as long as
2132   // inlining this would cause the removal of the caller (so the instruction
2133   // is not actually duplicated, just moved).
2134   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
2135     return InlineResult::failure("noduplicate");
2136 
2137   return finalizeAnalysis();
2138 }
2139 
2140 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2141 /// Dump stats about this call's analysis.
2142 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
2143 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
2144   if (PrintDebugInstructionDeltas)
2145     F.print(dbgs(), &Writer);
2146   DEBUG_PRINT_STAT(NumConstantArgs);
2147   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2148   DEBUG_PRINT_STAT(NumAllocaArgs);
2149   DEBUG_PRINT_STAT(NumConstantPtrCmps);
2150   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2151   DEBUG_PRINT_STAT(NumInstructionsSimplified);
2152   DEBUG_PRINT_STAT(NumInstructions);
2153   DEBUG_PRINT_STAT(SROACostSavings);
2154   DEBUG_PRINT_STAT(SROACostSavingsLost);
2155   DEBUG_PRINT_STAT(LoadEliminationCost);
2156   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2157   DEBUG_PRINT_STAT(Cost);
2158   DEBUG_PRINT_STAT(Threshold);
2159 #undef DEBUG_PRINT_STAT
2160 }
2161 #endif
2162 
2163 /// Test that there are no attribute conflicts between Caller and Callee
2164 ///        that prevent inlining.
2165 static bool functionsHaveCompatibleAttributes(
2166     Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2167     function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2168   // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2169   // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2170   // object, and always returns the same object (which is overwritten on each
2171   // GetTLI call). Therefore we copy the first result.
2172   auto CalleeTLI = GetTLI(*Callee);
2173   return TTI.areInlineCompatible(Caller, Callee) &&
2174          GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2175                                              InlineCallerSupersetNoBuiltin) &&
2176          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2177 }
2178 
2179 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
2180   int Cost = 0;
2181   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2182     if (Call.isByValArgument(I)) {
2183       // We approximate the number of loads and stores needed by dividing the
2184       // size of the byval type by the target's pointer size.
2185       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2186       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
2187       unsigned AS = PTy->getAddressSpace();
2188       unsigned PointerSize = DL.getPointerSizeInBits(AS);
2189       // Ceiling division.
2190       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2191 
2192       // If it generates more than 8 stores it is likely to be expanded as an
2193       // inline memcpy so we take that as an upper bound. Otherwise we assume
2194       // one load and one store per word copied.
2195       // FIXME: The maxStoresPerMemcpy setting from the target should be used
2196       // here instead of a magic number of 8, but it's not available via
2197       // DataLayout.
2198       NumStores = std::min(NumStores, 8U);
2199 
2200       Cost += 2 * NumStores * InlineConstants::InstrCost;
2201     } else {
2202       // For non-byval arguments subtract off one instruction per call
2203       // argument.
2204       Cost += InlineConstants::InstrCost;
2205     }
2206   }
2207   // The call instruction also disappears after inlining.
2208   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
2209   return Cost;
2210 }
2211 
2212 InlineCost llvm::getInlineCost(
2213     CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2214     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2215     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2216     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2217     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2218   return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2219                        GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
2220 }
2221 
2222 Optional<int> getInliningCostEstimate(
2223     CallBase &Call, TargetTransformInfo &CalleeTTI,
2224     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2225     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2226     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2227   const InlineParams Params = {/* DefaultThreshold*/ 0,
2228                                /*HintThreshold*/ {},
2229                                /*ColdThreshold*/ {},
2230                                /*OptSizeThreshold*/ {},
2231                                /*OptMinSizeThreshold*/ {},
2232                                /*HotCallSiteThreshold*/ {},
2233                                /*LocallyHotCallSiteThreshold*/ {},
2234                                /*ColdCallSiteThreshold*/ {},
2235                                /* ComputeFullInlineCost*/ true};
2236 
2237   InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
2238                             *Call.getCalledFunction(), Call, Params, true,
2239                             /*IgnoreThreshold*/ true);
2240   auto R = CA.analyze();
2241   if (!R.isSuccess())
2242     return None;
2243   return CA.getCost();
2244 }
2245 
2246 Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2247     CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2248     function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2249 
2250   // Cannot inline indirect calls.
2251   if (!Callee)
2252     return InlineResult::failure("indirect call");
2253 
2254   // Never inline calls with byval arguments that does not have the alloca
2255   // address space. Since byval arguments can be replaced with a copy to an
2256   // alloca, the inlined code would need to be adjusted to handle that the
2257   // argument is in the alloca address space (so it is a little bit complicated
2258   // to solve).
2259   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2260   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2261     if (Call.isByValArgument(I)) {
2262       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2263       if (PTy->getAddressSpace() != AllocaAS)
2264         return InlineResult::failure("byval arguments without alloca"
2265                                      " address space");
2266     }
2267 
2268   // Calls to functions with always-inline attributes should be inlined
2269   // whenever possible.
2270   if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2271     auto IsViable = isInlineViable(*Callee);
2272     if (IsViable.isSuccess())
2273       return InlineResult::success();
2274     return InlineResult::failure(IsViable.getFailureReason());
2275   }
2276 
2277   // Never inline functions with conflicting attributes (unless callee has
2278   // always-inline attribute).
2279   Function *Caller = Call.getCaller();
2280   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
2281     return InlineResult::failure("conflicting attributes");
2282 
2283   // Don't inline this call if the caller has the optnone attribute.
2284   if (Caller->hasOptNone())
2285     return InlineResult::failure("optnone attribute");
2286 
2287   // Don't inline a function that treats null pointer as valid into a caller
2288   // that does not have this attribute.
2289   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2290     return InlineResult::failure("nullptr definitions incompatible");
2291 
2292   // Don't inline functions which can be interposed at link-time.
2293   if (Callee->isInterposable())
2294     return InlineResult::failure("interposable");
2295 
2296   // Don't inline functions marked noinline.
2297   if (Callee->hasFnAttribute(Attribute::NoInline))
2298     return InlineResult::failure("noinline function attribute");
2299 
2300   // Don't inline call sites marked noinline.
2301   if (Call.isNoInline())
2302     return InlineResult::failure("noinline call site attribute");
2303 
2304   return None;
2305 }
2306 
2307 InlineCost llvm::getInlineCost(
2308     CallBase &Call, Function *Callee, const InlineParams &Params,
2309     TargetTransformInfo &CalleeTTI,
2310     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2311     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2312     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2313     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2314 
2315   auto UserDecision =
2316       llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
2317 
2318   if (UserDecision.hasValue()) {
2319     if (UserDecision->isSuccess())
2320       return llvm::InlineCost::getAlways("always inline attribute");
2321     return llvm::InlineCost::getNever(UserDecision->getFailureReason());
2322   }
2323 
2324   LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
2325                           << "... (caller:" << Call.getCaller()->getName()
2326                           << ")\n");
2327 
2328   InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
2329                             *Callee, Call, Params);
2330   InlineResult ShouldInline = CA.analyze();
2331 
2332   LLVM_DEBUG(CA.dump());
2333 
2334   // Check if there was a reason to force inlining or no inlining.
2335   if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
2336     return InlineCost::getNever(ShouldInline.getFailureReason());
2337   if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
2338     return InlineCost::getAlways("empty function");
2339 
2340   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2341 }
2342 
2343 InlineResult llvm::isInlineViable(Function &F) {
2344   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2345   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2346     // Disallow inlining of functions which contain indirect branches.
2347     if (isa<IndirectBrInst>(BI->getTerminator()))
2348       return InlineResult::failure("contains indirect branches");
2349 
2350     // Disallow inlining of blockaddresses which are used by non-callbr
2351     // instructions.
2352     if (BI->hasAddressTaken())
2353       for (User *U : BlockAddress::get(&*BI)->users())
2354         if (!isa<CallBrInst>(*U))
2355           return InlineResult::failure("blockaddress used outside of callbr");
2356 
2357     for (auto &II : *BI) {
2358       CallBase *Call = dyn_cast<CallBase>(&II);
2359       if (!Call)
2360         continue;
2361 
2362       // Disallow recursive calls.
2363       if (&F == Call->getCalledFunction())
2364         return InlineResult::failure("recursive call");
2365 
2366       // Disallow calls which expose returns-twice to a function not previously
2367       // attributed as such.
2368       if (!ReturnsTwice && isa<CallInst>(Call) &&
2369           cast<CallInst>(Call)->canReturnTwice())
2370         return InlineResult::failure("exposes returns-twice attribute");
2371 
2372       if (Call->getCalledFunction())
2373         switch (Call->getCalledFunction()->getIntrinsicID()) {
2374         default:
2375           break;
2376         case llvm::Intrinsic::icall_branch_funnel:
2377           // Disallow inlining of @llvm.icall.branch.funnel because current
2378           // backend can't separate call targets from call arguments.
2379           return InlineResult::failure(
2380               "disallowed inlining of @llvm.icall.branch.funnel");
2381         case llvm::Intrinsic::localescape:
2382           // Disallow inlining functions that call @llvm.localescape. Doing this
2383           // correctly would require major changes to the inliner.
2384           return InlineResult::failure(
2385               "disallowed inlining of @llvm.localescape");
2386         case llvm::Intrinsic::vastart:
2387           // Disallow inlining of functions that initialize VarArgs with
2388           // va_start.
2389           return InlineResult::failure(
2390               "contains VarArgs initialized with va_start");
2391         }
2392     }
2393   }
2394 
2395   return InlineResult::success();
2396 }
2397 
2398 // APIs to create InlineParams based on command line flags and/or other
2399 // parameters.
2400 
2401 InlineParams llvm::getInlineParams(int Threshold) {
2402   InlineParams Params;
2403 
2404   // This field is the threshold to use for a callee by default. This is
2405   // derived from one or more of:
2406   //  * optimization or size-optimization levels,
2407   //  * a value passed to createFunctionInliningPass function, or
2408   //  * the -inline-threshold flag.
2409   //  If the -inline-threshold flag is explicitly specified, that is used
2410   //  irrespective of anything else.
2411   if (InlineThreshold.getNumOccurrences() > 0)
2412     Params.DefaultThreshold = InlineThreshold;
2413   else
2414     Params.DefaultThreshold = Threshold;
2415 
2416   // Set the HintThreshold knob from the -inlinehint-threshold.
2417   Params.HintThreshold = HintThreshold;
2418 
2419   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2420   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2421 
2422   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2423   // populate LocallyHotCallSiteThreshold. Later, we populate
2424   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2425   // we know that optimization level is O3 (in the getInlineParams variant that
2426   // takes the opt and size levels).
2427   // FIXME: Remove this check (and make the assignment unconditional) after
2428   // addressing size regression issues at O2.
2429   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2430     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2431 
2432   // Set the ColdCallSiteThreshold knob from the
2433   // -inline-cold-callsite-threshold.
2434   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2435 
2436   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2437   // -inlinehint-threshold commandline option is not explicitly given. If that
2438   // option is present, then its value applies even for callees with size and
2439   // minsize attributes.
2440   // If the -inline-threshold is not specified, set the ColdThreshold from the
2441   // -inlinecold-threshold even if it is not explicitly passed. If
2442   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2443   // explicitly specified to set the ColdThreshold knob
2444   if (InlineThreshold.getNumOccurrences() == 0) {
2445     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2446     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2447     Params.ColdThreshold = ColdThreshold;
2448   } else if (ColdThreshold.getNumOccurrences() > 0) {
2449     Params.ColdThreshold = ColdThreshold;
2450   }
2451   return Params;
2452 }
2453 
2454 InlineParams llvm::getInlineParams() {
2455   return getInlineParams(DefaultThreshold);
2456 }
2457 
2458 // Compute the default threshold for inlining based on the opt level and the
2459 // size opt level.
2460 static int computeThresholdFromOptLevels(unsigned OptLevel,
2461                                          unsigned SizeOptLevel) {
2462   if (OptLevel > 2)
2463     return InlineConstants::OptAggressiveThreshold;
2464   if (SizeOptLevel == 1) // -Os
2465     return InlineConstants::OptSizeThreshold;
2466   if (SizeOptLevel == 2) // -Oz
2467     return InlineConstants::OptMinSizeThreshold;
2468   return DefaultThreshold;
2469 }
2470 
2471 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2472   auto Params =
2473       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2474   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2475   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2476   // when it is specified explicitly.
2477   if (OptLevel > 2)
2478     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2479   return Params;
2480 }
2481