1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements inline cost analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Analysis/InlineCost.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/CFG.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/InstructionSimplify.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Config/llvm-config.h"
31 #include "llvm/IR/AssemblyAnnotationWriter.h"
32 #include "llvm/IR/CallingConv.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/GetElementPtrTypeIterator.h"
36 #include "llvm/IR/GlobalAlias.h"
37 #include "llvm/IR/InstVisitor.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Operator.h"
40 #include "llvm/IR/PatternMatch.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/FormattedStream.h"
44 #include "llvm/Support/raw_ostream.h"
45 
46 using namespace llvm;
47 
48 #define DEBUG_TYPE "inline-cost"
49 
50 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
51 
52 static cl::opt<int>
53     DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
54                      cl::ZeroOrMore,
55                      cl::desc("Default amount of inlining to perform"));
56 
57 static cl::opt<bool> PrintDebugInstructionDeltas(
58     "print-instruction-deltas", cl::Hidden, cl::init(false),
59     cl::desc("Prints deltas of cost and threshold per instruction"));
60 
61 static cl::opt<int> InlineThreshold(
62     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
63     cl::desc("Control the amount of inlining to perform (default = 225)"));
64 
65 static cl::opt<int> HintThreshold(
66     "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
67     cl::desc("Threshold for inlining functions with inline hint"));
68 
69 static cl::opt<int>
70     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
71                           cl::init(45), cl::ZeroOrMore,
72                           cl::desc("Threshold for inlining cold callsites"));
73 
74 // We introduce this threshold to help the performance of instrumentation-based
75 // PGO before we actually hook up the inliner with analysis passes such as BPI
76 // and BFI.
77 static cl::opt<int> ColdThreshold(
78     "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
79     cl::desc("Threshold for inlining functions with cold attribute"));
80 
81 static cl::opt<int>
82     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
83                          cl::ZeroOrMore,
84                          cl::desc("Threshold for hot callsites "));
85 
86 static cl::opt<int> LocallyHotCallSiteThreshold(
87     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
88     cl::desc("Threshold for locally hot callsites "));
89 
90 static cl::opt<int> ColdCallSiteRelFreq(
91     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
92     cl::desc("Maximum block frequency, expressed as a percentage of caller's "
93              "entry frequency, for a callsite to be cold in the absence of "
94              "profile information."));
95 
96 static cl::opt<int> HotCallSiteRelFreq(
97     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
98     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
99              "entry frequency, for a callsite to be hot in the absence of "
100              "profile information."));
101 
102 static cl::opt<bool> OptComputeFullInlineCost(
103     "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
104     cl::desc("Compute the full inline cost of a call site even when the cost "
105              "exceeds the threshold."));
106 
107 static cl::opt<bool> InlineCallerSupersetNoBuiltin(
108     "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
109     cl::ZeroOrMore,
110     cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
111              "attributes."));
112 
113 namespace {
114 class InlineCostCallAnalyzer;
115 
116 // This struct is used to store information about the inline cost of a
117 // particular instruction.
118 struct InstructionCostDetail {
119   int CostBefore = 0;
120   int CostAfter = 0;
121   int ThresholdBefore = 0;
122   int ThresholdAfter = 0;
123 
124   int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
125 
126   int getCostDelta() const { return CostAfter - CostBefore; }
127 
128   bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
129 };
130 
131 class CostAnnotationWriter : public AssemblyAnnotationWriter {
132 public:
133   // This DenseMap stores the delta change in cost and threshold after
134   // accounting for the given instruction.
135   DenseMap<const Instruction *, InstructionCostDetail> CostThresholdMap;
136 
137   virtual void emitInstructionAnnot(const Instruction *I,
138                                     formatted_raw_ostream &OS);
139 };
140 
141 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
142   typedef InstVisitor<CallAnalyzer, bool> Base;
143   friend class InstVisitor<CallAnalyzer, bool>;
144 
145 protected:
146   virtual ~CallAnalyzer() {}
147   /// The TargetTransformInfo available for this compilation.
148   const TargetTransformInfo &TTI;
149 
150   /// Getter for the cache of @llvm.assume intrinsics.
151   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
152 
153   /// Getter for BlockFrequencyInfo
154   Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
155 
156   /// Profile summary information.
157   ProfileSummaryInfo *PSI;
158 
159   /// The called function.
160   Function &F;
161 
162   // Cache the DataLayout since we use it a lot.
163   const DataLayout &DL;
164 
165   /// The OptimizationRemarkEmitter available for this compilation.
166   OptimizationRemarkEmitter *ORE;
167 
168   /// The candidate callsite being analyzed. Please do not use this to do
169   /// analysis in the caller function; we want the inline cost query to be
170   /// easily cacheable. Instead, use the cover function paramHasAttr.
171   CallBase &CandidateCall;
172 
173   /// Extension points for handling callsite features.
174   /// Called after a basic block was analyzed.
175   virtual void onBlockAnalyzed(const BasicBlock *BB) {}
176 
177   /// Called before an instruction is analyzed.
178   virtual void onInstructionAnalysisStart(const Instruction *I) {}
179 
180   /// Called after an instruction has been analyzed.
181   virtual void onInstructionAnalysisFinish(const Instruction *I) {}
182 
183   /// Called at the end of the analysis of the callsite. Return the outcome of
184   /// the analysis, i.e. 'InlineResult::success()' if the inlining may happen,
185   /// or the reason it can't.
186   virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
187   /// Called when we're about to start processing a basic block, and every time
188   /// we are done processing an instruction. Return true if there is no point in
189   /// continuing the analysis (e.g. we've already determined the call site is
190   /// too expensive to inline).
191   virtual bool shouldStop() { return false; }
192 
193   /// Called before the analysis of the callee body starts (with callsite
194   /// contexts propagated). It checks callsite-specific information. Return a
195   /// reason the analysis can't continue if that's the case, or a success
196   /// result if it may continue.
197   virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
198   /// Called if the analysis engine decides SROA cannot be done for the given
199   /// alloca.
200   virtual void onDisableSROA(AllocaInst *Arg) {}
201 
202   /// Called when the analysis engine determines load elimination won't happen.
203   virtual void onDisableLoadElimination() {}
204 
205   /// Called to account for a call.
206   virtual void onCallPenalty() {}
207 
208   /// Called to account for the expectation the inlining would result in a load
209   /// elimination.
210   virtual void onLoadEliminationOpportunity() {}
211 
212   /// Called to account for the cost of argument setup for the Call in the
213   /// callee's body (not the callsite currently under analysis).
214   virtual void onCallArgumentSetup(const CallBase &Call) {}
215 
216   /// Called to account for a load relative intrinsic.
217   virtual void onLoadRelativeIntrinsic() {}
218 
219   /// Called to account for a lowered call.
220   virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
221   }
222 
223   /// Account for a jump table of the given size. Return false to stop further
224   /// processing of the switch instruction.
225   virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
226 
227   /// Account for a case cluster of given size. Return false to stop further
228   /// processing of the instruction.
229   virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
230 
231   /// Called at the end of processing a switch instruction, with the given
232   /// number of case clusters.
233   virtual void onFinalizeSwitch(unsigned JumpTableSize,
234                                 unsigned NumCaseCluster) {}
235 
236   /// Called to account for any other instruction not specifically accounted
237   /// for.
238   virtual void onMissedSimplification() {}
239 
240   /// Start accounting potential benefits due to SROA for the given alloca.
241   virtual void onInitializeSROAArg(AllocaInst *Arg) {}
242 
243   /// Account SROA savings for the AllocaInst value.
244   virtual void onAggregateSROAUse(AllocaInst *V) {}
245 
246   bool handleSROA(Value *V, bool DoNotDisable) {
247     // Check for SROA candidates in comparisons.
248     if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
249       if (DoNotDisable) {
250         onAggregateSROAUse(SROAArg);
251         return true;
252       }
253       disableSROAForArg(SROAArg);
254     }
255     return false;
256   }
257 
258   bool IsCallerRecursive = false;
259   bool IsRecursiveCall = false;
260   bool ExposesReturnsTwice = false;
261   bool HasDynamicAlloca = false;
262   bool ContainsNoDuplicateCall = false;
263   bool HasReturn = false;
264   bool HasIndirectBr = false;
265   bool HasUninlineableIntrinsic = false;
266   bool InitsVargArgs = false;
267 
268   /// Number of bytes allocated statically by the callee.
269   uint64_t AllocatedSize = 0;
270   unsigned NumInstructions = 0;
271   unsigned NumVectorInstructions = 0;
272 
273   /// While we walk the potentially-inlined instructions, we build up and
274   /// maintain a mapping of simplified values specific to this callsite. The
275   /// idea is to propagate any special information we have about arguments to
276   /// this call through the inlinable section of the function, and account for
277   /// likely simplifications post-inlining. The most important aspect we track
278   /// is CFG altering simplifications -- when we prove a basic block dead, that
279   /// can cause dramatic shifts in the cost of inlining a function.
280   DenseMap<Value *, Constant *> SimplifiedValues;
281 
282   /// Keep track of the values which map back (through function arguments) to
283   /// allocas on the caller stack which could be simplified through SROA.
284   DenseMap<Value *, AllocaInst *> SROAArgValues;
285 
286   /// Keep track of Allocas for which we believe we may get SROA optimization.
287   DenseSet<AllocaInst *> EnabledSROAAllocas;
288 
289   /// Keep track of values which map to a pointer base and constant offset.
290   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
291 
292   /// Keep track of dead blocks due to the constant arguments.
293   SetVector<BasicBlock *> DeadBlocks;
294 
295   /// The mapping of the blocks to their known unique successors due to the
296   /// constant arguments.
297   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
298 
299   /// Model the elimination of repeated loads that is expected to happen
300   /// whenever we simplify away the stores that would otherwise force those
301   /// values to be reloaded.
302   bool EnableLoadElimination;
303   SmallPtrSet<Value *, 16> LoadAddrSet;
304 
305   AllocaInst *getSROAArgForValueOrNull(Value *V) const {
306     auto It = SROAArgValues.find(V);
307     if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
308       return nullptr;
309     return It->second;
310   }
311 
312   // Custom simplification helper routines.
313   bool isAllocaDerivedArg(Value *V);
314   void disableSROAForArg(AllocaInst *SROAArg);
315   void disableSROA(Value *V);
316   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
317   void disableLoadElimination();
318   bool isGEPFree(GetElementPtrInst &GEP);
319   bool canFoldInboundsGEP(GetElementPtrInst &I);
320   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
321   bool simplifyCallSite(Function *F, CallBase &Call);
322   template <typename Callable>
323   bool simplifyInstruction(Instruction &I, Callable Evaluate);
324   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
325 
326   /// Return true if the given argument to the function being considered for
327   /// inlining has the given attribute set either at the call site or the
328   /// function declaration.  Primarily used to inspect call site specific
329   /// attributes since these can be more precise than the ones on the callee
330   /// itself.
331   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
332 
333   /// Return true if the given value is known non null within the callee if
334   /// inlined through this particular callsite.
335   bool isKnownNonNullInCallee(Value *V);
336 
337   /// Return true if size growth is allowed when inlining the callee at \p Call.
338   bool allowSizeGrowth(CallBase &Call);
339 
340   // Custom analysis routines.
341   InlineResult analyzeBlock(BasicBlock *BB,
342                             SmallPtrSetImpl<const Value *> &EphValues);
343 
344   // Disable several entry points to the visitor so we don't accidentally use
345   // them by declaring but not defining them here.
346   void visit(Module *);
347   void visit(Module &);
348   void visit(Function *);
349   void visit(Function &);
350   void visit(BasicBlock *);
351   void visit(BasicBlock &);
352 
353   // Provide base case for our instruction visit.
354   bool visitInstruction(Instruction &I);
355 
356   // Our visit overrides.
357   bool visitAlloca(AllocaInst &I);
358   bool visitPHI(PHINode &I);
359   bool visitGetElementPtr(GetElementPtrInst &I);
360   bool visitBitCast(BitCastInst &I);
361   bool visitPtrToInt(PtrToIntInst &I);
362   bool visitIntToPtr(IntToPtrInst &I);
363   bool visitCastInst(CastInst &I);
364   bool visitUnaryInstruction(UnaryInstruction &I);
365   bool visitCmpInst(CmpInst &I);
366   bool visitSub(BinaryOperator &I);
367   bool visitBinaryOperator(BinaryOperator &I);
368   bool visitFNeg(UnaryOperator &I);
369   bool visitLoad(LoadInst &I);
370   bool visitStore(StoreInst &I);
371   bool visitExtractValue(ExtractValueInst &I);
372   bool visitInsertValue(InsertValueInst &I);
373   bool visitCallBase(CallBase &Call);
374   bool visitReturnInst(ReturnInst &RI);
375   bool visitBranchInst(BranchInst &BI);
376   bool visitSelectInst(SelectInst &SI);
377   bool visitSwitchInst(SwitchInst &SI);
378   bool visitIndirectBrInst(IndirectBrInst &IBI);
379   bool visitResumeInst(ResumeInst &RI);
380   bool visitCleanupReturnInst(CleanupReturnInst &RI);
381   bool visitCatchReturnInst(CatchReturnInst &RI);
382   bool visitUnreachableInst(UnreachableInst &I);
383 
384 public:
385   CallAnalyzer(const TargetTransformInfo &TTI,
386                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
387                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
388                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
389                Function &Callee, CallBase &Call)
390       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
391         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
392         CandidateCall(Call), EnableLoadElimination(true) {}
393 
394   InlineResult analyze();
395 
396   // Keep a bunch of stats about the cost savings found so we can print them
397   // out when debugging.
398   unsigned NumConstantArgs = 0;
399   unsigned NumConstantOffsetPtrArgs = 0;
400   unsigned NumAllocaArgs = 0;
401   unsigned NumConstantPtrCmps = 0;
402   unsigned NumConstantPtrDiffs = 0;
403   unsigned NumInstructionsSimplified = 0;
404 
405   void dump();
406 };
407 
408 /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
409 /// the FIXME in onLoweredCall, where an InlineCostCallAnalyzer is instantiated.
410 class InlineCostCallAnalyzer final : public CallAnalyzer {
411   const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
412   const bool ComputeFullInlineCost;
413   int LoadEliminationCost = 0;
414   /// Bonus to be applied when the percentage of vector instructions in the
415   /// callee is high (see more details in updateThreshold).
416   int VectorBonus = 0;
417   /// Bonus to be applied when the callee has only one reachable basic block.
418   int SingleBBBonus = 0;
419 
420   /// Tunable parameters that control the analysis.
421   const InlineParams &Params;
422 
423   /// Upper bound for the inlining cost. Bonuses are being applied to account
424   /// for speculative "expected profit" of the inlining decision.
425   int Threshold = 0;
426 
427   /// Attempt to evaluate indirect calls to boost their inline cost.
428   const bool BoostIndirectCalls;
429 
430   /// Ignore the threshold when finalizing analysis.
431   const bool IgnoreThreshold;
432 
433   /// Inlining cost measured in abstract units, accounts for all the
434   /// instructions expected to be executed for a given function invocation.
435   /// Instructions that are statically proven to be dead based on call-site
436   /// arguments are not counted here.
437   int Cost = 0;
438 
439   bool SingleBB = true;
440 
441   unsigned SROACostSavings = 0;
442   unsigned SROACostSavingsLost = 0;
443 
444   /// The mapping of caller Alloca values to their accumulated cost savings. If
445   /// we have to disable SROA for one of the allocas, this tells us how much
446   /// cost must be added.
447   DenseMap<AllocaInst *, int> SROAArgCosts;
448 
449   /// Return true if \p Call is a cold callsite.
450   bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
451 
452   /// Update Threshold based on callsite properties such as callee
453   /// attributes and callee hotness for PGO builds. The Callee is explicitly
454   /// passed to support analyzing indirect calls whose target is inferred by
455   /// analysis.
456   void updateThreshold(CallBase &Call, Function &Callee);
457   /// Return a higher threshold if \p Call is a hot callsite.
458   Optional<int> getHotCallSiteThreshold(CallBase &Call,
459                                         BlockFrequencyInfo *CallerBFI);
460 
461   /// Handle a capped 'int' increment for Cost.
462   void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
463     assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
464     Cost = (int)std::min(UpperBound, Cost + Inc);
465   }
466 
467   void onDisableSROA(AllocaInst *Arg) override {
468     auto CostIt = SROAArgCosts.find(Arg);
469     if (CostIt == SROAArgCosts.end())
470       return;
471     addCost(CostIt->second);
472     SROACostSavings -= CostIt->second;
473     SROACostSavingsLost += CostIt->second;
474     SROAArgCosts.erase(CostIt);
475   }
476 
477   void onDisableLoadElimination() override {
478     addCost(LoadEliminationCost);
479     LoadEliminationCost = 0;
480   }
481   void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
482   void onCallArgumentSetup(const CallBase &Call) override {
483     // Pay the price of the argument setup. We account for the average 1
484     // instruction per call argument setup here.
485     addCost(Call.arg_size() * InlineConstants::InstrCost);
486   }
487   void onLoadRelativeIntrinsic() override {
488     // This is normally lowered to 4 LLVM instructions.
489     addCost(3 * InlineConstants::InstrCost);
490   }
491   void onLoweredCall(Function *F, CallBase &Call,
492                      bool IsIndirectCall) override {
493     // We account for the average 1 instruction per call argument setup here.
494     addCost(Call.arg_size() * InlineConstants::InstrCost);
495 
496     // If we have a constant that we are calling as a function, we can peer
497     // through it and see the function target. This happens not infrequently
498     // during devirtualization and so we want to give it a hefty bonus for
499     // inlining, but cap that bonus in the event that inlining wouldn't pan out.
500     // Pretend to inline the function, with a custom threshold.
501     if (IsIndirectCall && BoostIndirectCalls) {
502       auto IndirectCallParams = Params;
503       IndirectCallParams.DefaultThreshold =
504           InlineConstants::IndirectCallThreshold;
505       /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
506       /// to instantiate the derived class.
507       InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
508                                 Call, IndirectCallParams, false);
509       if (CA.analyze().isSuccess()) {
510         // We were able to inline the indirect call! Subtract the cost from the
511         // threshold to get the bonus we want to apply, but don't go below zero.
512         Cost -= std::max(0, CA.getThreshold() - CA.getCost());
513       }
514     } else
515       // Otherwise simply add the cost for merely making the call.
516       addCost(InlineConstants::CallPenalty);
517   }
518 
519   void onFinalizeSwitch(unsigned JumpTableSize,
520                         unsigned NumCaseCluster) override {
521     // If suitable for a jump table, consider the cost for the table size and
522     // the branch to the destination.
523     // Cost increases in this function are capped at CostUpperBound.
524     if (JumpTableSize) {
525       int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
526                        4 * InlineConstants::InstrCost;
527 
528       addCost(JTCost, (int64_t)CostUpperBound);
529       return;
530     }
531     // Considering forming a binary search tree, we should find the number of
532     // nodes, which is the same as the number of comparisons when lowered. For
533     // a given number of clusters, n, we can define a recursive function, f(n),
534     // to find the number of nodes in the tree. The recursion is:
535     // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
536     // and f(n) = n, when n <= 3.
537     // This leads to a binary tree where each leaf is either f(2) or f(3) when
538     // n > 3. So, the number of comparisons from leaves is n, while the number
539     // of non-leaf nodes is:
540     //   2^(log2(n) - 1) - 1
541     //   = 2^log2(n) * 2^-1 - 1
542     //   = n / 2 - 1.
543     // Combining comparisons from leaf and non-leaf nodes, we can estimate the
544     // total number of comparisons in a simple closed form:
545     //   n + n / 2 - 1 = n * 3 / 2 - 1
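    // For example, with n = 8 clusters: f(2) = 2, f(4) = 1 + 2 * f(2) = 5, and
    // f(8) = 1 + 2 * f(4) = 11, matching the closed form n * 3 / 2 - 1 = 11.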
546     if (NumCaseCluster <= 3) {
547       // Suppose a comparison includes one compare and one conditional branch.
548       addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
549       return;
550     }
551 
552     int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
553     int64_t SwitchCost =
554         ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
555 
556     addCost(SwitchCost, (int64_t)CostUpperBound);
557   }
558   void onMissedSimplification() override {
559     addCost(InlineConstants::InstrCost);
560   }
561 
562   void onInitializeSROAArg(AllocaInst *Arg) override {
563     assert(Arg != nullptr &&
564            "Should not initialize SROA costs for null value.");
565     SROAArgCosts[Arg] = 0;
566   }
567 
568   void onAggregateSROAUse(AllocaInst *SROAArg) override {
569     auto CostIt = SROAArgCosts.find(SROAArg);
570     assert(CostIt != SROAArgCosts.end() &&
571            "expected this argument to have a cost");
572     CostIt->second += InlineConstants::InstrCost;
573     SROACostSavings += InlineConstants::InstrCost;
574   }
575 
576   void onBlockAnalyzed(const BasicBlock *BB) override {
577     auto *TI = BB->getTerminator();
578     // If we had any successors at this point, then post-inlining is likely to
579     // have them as well. Note that we assume any basic blocks which existed
580     // due to branches or switches which folded above will also fold after
581     // inlining.
582     if (SingleBB && TI->getNumSuccessors() > 1) {
583       // Take off the bonus we applied to the threshold.
584       Threshold -= SingleBBBonus;
585       SingleBB = false;
586     }
587   }
588 
589   void onInstructionAnalysisStart(const Instruction *I) override {
590     // This function is called to store the initial cost of inlining before
591     // the given instruction was assessed.
592     if (!PrintDebugInstructionDeltas)
593       return;
594     Writer.CostThresholdMap[I].CostBefore = Cost;
595     Writer.CostThresholdMap[I].ThresholdBefore = Threshold;
596   }
597 
598   void onInstructionAnalysisFinish(const Instruction *I) override {
599     // This function is called to find new values of cost and threshold after
600     // the instruction has been assessed.
601     if (!PrintDebugInstructionDeltas)
602       return;
603     Writer.CostThresholdMap[I].CostAfter = Cost;
604     Writer.CostThresholdMap[I].ThresholdAfter = Threshold;
605   }
606 
607   InlineResult finalizeAnalysis() override {
608     // Loops generally act a lot like calls in that they act like barriers to
609     // movement, require a certain amount of setup, etc. So when optimizing for
610     // size, we penalize any call sites whose callees contain loops. We do this
611     // after all other costs here, so will likely only be dealing with relatively
612     // small functions (and hence DT and LI will hopefully be cheap).
613     auto *Caller = CandidateCall.getFunction();
614     if (Caller->hasMinSize()) {
615       DominatorTree DT(F);
616       LoopInfo LI(DT);
617       int NumLoops = 0;
618       for (Loop *L : LI) {
619         // Ignore loops that will not be executed
620         if (DeadBlocks.count(L->getHeader()))
621           continue;
622         NumLoops++;
623       }
624       addCost(NumLoops * InlineConstants::CallPenalty);
625     }
626 
627     // We applied the maximum possible vector bonus at the beginning. Now,
628     // subtract the excess bonus, if any, from the Threshold before
629     // comparing against Cost.
630     if (NumVectorInstructions <= NumInstructions / 10)
631       Threshold -= VectorBonus;
632     else if (NumVectorInstructions <= NumInstructions / 2)
633       Threshold -= VectorBonus / 2;
634 
635     if (IgnoreThreshold || Cost < std::max(1, Threshold))
636       return InlineResult::success();
637     return InlineResult::failure("Cost over threshold.");
638   }
639   bool shouldStop() override {
640     // Bail out the moment we cross the threshold. This means we'll under-count
641     // the cost, but only when undercounting doesn't matter.
642     return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost;
643   }
644 
645   void onLoadEliminationOpportunity() override {
646     LoadEliminationCost += InlineConstants::InstrCost;
647   }
648 
649   InlineResult onAnalysisStart() override {
650     // Perform some tweaks to the cost and threshold based on the direct
651     // callsite information.
652 
653     // We want to more aggressively inline vector-dense kernels, so up the
654     // threshold, and we'll lower it if the % of vector instructions gets too
655     // low. Note that these bonuses are somewhat arbitrary and evolved over
656     // time by accident as much as because they are principled bonuses.
657     //
658     // FIXME: It would be nice to remove all such bonuses. At least it would be
659     // nice to base the bonus values on something more scientific.
660     assert(NumInstructions == 0);
661     assert(NumVectorInstructions == 0);
662 
663     // Update the threshold based on callsite properties
664     updateThreshold(CandidateCall, F);
665 
666     // While Threshold depends on commandline options that can take negative
667     // values, we want to enforce the invariant that the computed threshold and
668     // bonuses are non-negative.
669     assert(Threshold >= 0);
670     assert(SingleBBBonus >= 0);
671     assert(VectorBonus >= 0);
672 
673     // Speculatively apply all possible bonuses to Threshold. If cost exceeds
674     // this Threshold any time, and cost cannot decrease, we can stop processing
675     // the rest of the function body.
676     Threshold += (SingleBBBonus + VectorBonus);
677 
678     // Give out bonuses for the callsite, as the instructions setting them up
679     // will be gone after inlining.
680     addCost(-getCallsiteCost(this->CandidateCall, DL));
681 
682     // If this function uses the coldcc calling convention, prefer not to inline
683     // it.
684     if (F.getCallingConv() == CallingConv::Cold)
685       Cost += InlineConstants::ColdccPenalty;
686 
687     // Check if we're done. This can happen due to bonuses and penalties.
688     if (Cost >= Threshold && !ComputeFullInlineCost)
689       return InlineResult::failure("high cost");
690 
691     return InlineResult::success();
692   }
693 
694 public:
695   InlineCostCallAnalyzer(
696       const TargetTransformInfo &TTI,
697       std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
698       Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
699       ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
700       CallBase &Call, const InlineParams &Params, bool BoostIndirect = true,
701       bool IgnoreThreshold = false)
702       : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
703         ComputeFullInlineCost(OptComputeFullInlineCost ||
704                               Params.ComputeFullInlineCost || ORE),
705         Params(Params), Threshold(Params.DefaultThreshold),
706         BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold) {}
707 
708   /// Annotation Writer for cost annotation
709   CostAnnotationWriter Writer;
710 
711   void dump();
712 
713   virtual ~InlineCostCallAnalyzer() {}
714   int getThreshold() { return Threshold; }
715   int getCost() { return Cost; }
716 };
717 } // namespace
718 
719 /// Test whether the given value is an Alloca-derived function argument.
720 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
721   return SROAArgValues.count(V);
722 }
723 
724 void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
725   onDisableSROA(SROAArg);
726   EnabledSROAAllocas.erase(SROAArg);
727   disableLoadElimination();
728 }
729 
730 void CostAnnotationWriter::emitInstructionAnnot(const Instruction *I,
731                                                 formatted_raw_ostream &OS) {
732   // The cost of inlining the given instruction is always printed.
733   // The threshold delta is printed only when it is non-zero, which happens
734   // when we decide to give a bonus at a particular instruction.
735   if (CostThresholdMap.count(I) == 0) {
736     OS << "; No analysis for the instruction\n";
737     return;
738   }
739   const auto &Record = CostThresholdMap[I];
740   OS << "; cost before = " << Record.CostBefore
741      << ", cost after = " << Record.CostAfter
742      << ", threshold before = " << Record.ThresholdBefore
743      << ", threshold after = " << Record.ThresholdAfter << ", ";
744   OS << "cost delta = " << Record.getCostDelta();
745   if (Record.hasThresholdChanged())
746     OS << ", threshold delta = " << Record.getThresholdDelta();
747   OS << "\n";
748 }
749 
750 /// If 'V' maps to a SROA candidate, disable SROA for it.
751 void CallAnalyzer::disableSROA(Value *V) {
752   if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
753     disableSROAForArg(SROAArg);
754   }
755 }
756 
757 void CallAnalyzer::disableLoadElimination() {
758   if (EnableLoadElimination) {
759     onDisableLoadElimination();
760     EnableLoadElimination = false;
761   }
762 }
763 
764 /// Accumulate a constant GEP offset into an APInt if possible.
765 ///
766 /// Returns false if unable to compute the offset for any reason. Respects any
767 /// simplified values known during the analysis of this callsite.
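/// For example, a GEP into a {i64, i32} struct with constant indices 1 and 1
/// accumulates an offset of 16 + 8 = 24 bytes, assuming a typical data layout
/// where the struct's alloc size is padded to 16 bytes.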
768 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
769   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
770   assert(IntPtrWidth == Offset.getBitWidth());
771 
772   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
773        GTI != GTE; ++GTI) {
774     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
775     if (!OpC)
776       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
777         OpC = dyn_cast<ConstantInt>(SimpleOp);
778     if (!OpC)
779       return false;
780     if (OpC->isZero())
781       continue;
782 
783     // Handle a struct index, which adds its field offset to the pointer.
784     if (StructType *STy = GTI.getStructTypeOrNull()) {
785       unsigned ElementIdx = OpC->getZExtValue();
786       const StructLayout *SL = DL.getStructLayout(STy);
787       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
788       continue;
789     }
790 
791     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
792     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
793   }
794   return true;
795 }
796 
797 /// Use TTI to check whether a GEP is free.
798 ///
799 /// Respects any simplified values known during the analysis of this callsite.
800 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
801   SmallVector<Value *, 4> Operands;
802   Operands.push_back(GEP.getOperand(0));
803   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
804     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
805       Operands.push_back(SimpleOp);
806     else
807       Operands.push_back(*I);
808   return TargetTransformInfo::TCC_Free ==
809          TTI.getUserCost(&GEP, Operands,
810                          TargetTransformInfo::TCK_SizeAndLatency);
811 }
812 
813 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
814   // Check whether inlining will turn a dynamic alloca into a static
815   // alloca and handle that case.
816   if (I.isArrayAllocation()) {
817     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
818     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
819       Type *Ty = I.getAllocatedType();
820       AllocatedSize = SaturatingMultiplyAdd(
821           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
822           AllocatedSize);
823       return Base::visitAlloca(I);
824     }
825   }
826 
827   // Accumulate the allocated size.
828   if (I.isStaticAlloca()) {
829     Type *Ty = I.getAllocatedType();
830     AllocatedSize =
831         SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(), AllocatedSize);
832   }
833 
834   // We will happily inline static alloca instructions.
835   if (I.isStaticAlloca())
836     return Base::visitAlloca(I);
837 
838   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
839   // a variety of reasons, and so we would like to not inline them into
840   // functions which don't currently have a dynamic alloca. This simply
841   // disables inlining altogether in the presence of a dynamic alloca.
842   HasDynamicAlloca = true;
843   return false;
844 }
845 
846 bool CallAnalyzer::visitPHI(PHINode &I) {
847   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
848   // though we don't want to propagate its bonuses. The idea is to disable
849   // SROA if it *might* be used in an inappropriate manner.
850 
851   // Phi nodes are always zero-cost.
852   // FIXME: Pointer sizes may differ between different address spaces, so do we
853   // need to use correct address space in the call to getPointerSizeInBits here?
854   // Or could we skip the getPointerSizeInBits call completely? As far as I can
855   // see the ZeroOffset is used as a dummy value, so we can probably use any
856   // bit width for the ZeroOffset?
857   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
858   bool CheckSROA = I.getType()->isPointerTy();
859 
860   // Track the constant or pointer with constant offset we've seen so far.
861   Constant *FirstC = nullptr;
862   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
863   Value *FirstV = nullptr;
864 
865   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
866     BasicBlock *Pred = I.getIncomingBlock(i);
867     // If the incoming block is dead, skip the incoming block.
868     if (DeadBlocks.count(Pred))
869       continue;
870     // If the parent block of phi is not the known successor of the incoming
871     // block, skip the incoming block.
872     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
873     if (KnownSuccessor && KnownSuccessor != I.getParent())
874       continue;
875 
876     Value *V = I.getIncomingValue(i);
877     // If the incoming value is this phi itself, skip the incoming value.
878     if (&I == V)
879       continue;
880 
881     Constant *C = dyn_cast<Constant>(V);
882     if (!C)
883       C = SimplifiedValues.lookup(V);
884 
885     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
886     if (!C && CheckSROA)
887       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
888 
889     if (!C && !BaseAndOffset.first)
890       // The incoming value is neither a constant nor a pointer with constant
891       // offset, exit early.
892       return true;
893 
894     if (FirstC) {
895       if (FirstC == C)
896         // If we've seen a constant incoming value before and it is the same
897         // constant we see this time, continue checking the next incoming value.
898         continue;
899       // Otherwise early exit because we either see a different constant or saw
900       // a constant before but we have a pointer with constant offset this time.
901       return true;
902     }
903 
904     if (FirstV) {
905       // The same logic as above, but check pointer with constant offset here.
906       if (FirstBaseAndOffset == BaseAndOffset)
907         continue;
908       return true;
909     }
910 
911     if (C) {
912       // This is the 1st time we've seen a constant, record it.
913       FirstC = C;
914       continue;
915     }
916 
917     // The remaining case is that this is the 1st time we've seen a pointer with
918     // constant offset, record it.
919     FirstV = V;
920     FirstBaseAndOffset = BaseAndOffset;
921   }
922 
923   // Check if we can map phi to a constant.
924   if (FirstC) {
925     SimplifiedValues[&I] = FirstC;
926     return true;
927   }
928 
929   // Check if we can map phi to a pointer with constant offset.
930   if (FirstBaseAndOffset.first) {
931     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
932 
933     if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
934       SROAArgValues[&I] = SROAArg;
935   }
936 
937   return true;
938 }
939 
940 /// Check we can fold GEPs of constant-offset call site argument pointers.
941 /// This requires target data and inbounds GEPs.
942 ///
943 /// \return true if the specified GEP can be folded.
944 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
945   // Check if we have a base + offset for the pointer.
946   std::pair<Value *, APInt> BaseAndOffset =
947       ConstantOffsetPtrs.lookup(I.getPointerOperand());
948   if (!BaseAndOffset.first)
949     return false;
950 
951   // Check if the offset of this GEP is constant, and if so accumulate it
952   // into Offset.
953   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
954     return false;
955 
956   // Add the result as a new mapping to Base + Offset.
957   ConstantOffsetPtrs[&I] = BaseAndOffset;
958 
959   return true;
960 }
961 
962 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
963   auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
964 
965   // Lambda to check whether a GEP's indices are all constant.
966   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
967     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
968       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
969         return false;
970     return true;
971   };
972 
973   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
974     if (SROAArg)
975       SROAArgValues[&I] = SROAArg;
976 
977     // Constant GEPs are modeled as free.
978     return true;
979   }
980 
981   // Variable GEPs will require math and will disable SROA.
982   if (SROAArg)
983     disableSROAForArg(SROAArg);
984   return isGEPFree(I);
985 }
986 
987 /// Simplify \p I if its operands are constants and update SimplifiedValues.
988 /// \p Evaluate is a callable specific to instruction type that evaluates the
989 /// instruction when all the operands are constants.
990 template <typename Callable>
991 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
992   SmallVector<Constant *, 2> COps;
993   for (Value *Op : I.operands()) {
994     Constant *COp = dyn_cast<Constant>(Op);
995     if (!COp)
996       COp = SimplifiedValues.lookup(Op);
997     if (!COp)
998       return false;
999     COps.push_back(COp);
1000   }
1001   auto *C = Evaluate(COps);
1002   if (!C)
1003     return false;
1004   SimplifiedValues[&I] = C;
1005   return true;
1006 }
1007 
1008 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1009   // Propagate constants through bitcasts.
1010   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1011         return ConstantExpr::getBitCast(COps[0], I.getType());
1012       }))
1013     return true;
1014 
1015   // Track base/offsets through casts
1016   std::pair<Value *, APInt> BaseAndOffset =
1017       ConstantOffsetPtrs.lookup(I.getOperand(0));
1018   // Casts don't change the offset, just wrap it up.
1019   if (BaseAndOffset.first)
1020     ConstantOffsetPtrs[&I] = BaseAndOffset;
1021 
1022   // Also look for SROA candidates here.
1023   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1024     SROAArgValues[&I] = SROAArg;
1025 
1026   // Bitcasts are always zero cost.
1027   return true;
1028 }
1029 
1030 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1031   // Propagate constants through ptrtoint.
1032   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1033         return ConstantExpr::getPtrToInt(COps[0], I.getType());
1034       }))
1035     return true;
1036 
1037   // Track base/offset pairs when converted to a plain integer provided the
1038   // integer is large enough to represent the pointer.
1039   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1040   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1041   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
1042     std::pair<Value *, APInt> BaseAndOffset =
1043         ConstantOffsetPtrs.lookup(I.getOperand(0));
1044     if (BaseAndOffset.first)
1045       ConstantOffsetPtrs[&I] = BaseAndOffset;
1046   }
1047 
1048   // This is really weird. Technically, ptrtoint will disable SROA. However,
1049   // unless that ptrtoint is *used* somewhere in the live basic blocks after
1050   // inlining, it will be nuked, and SROA should proceed. All of the uses which
1051   // would block SROA would also block SROA if applied directly to a pointer,
1052   // and so we can just add the integer in here. The only places where SROA is
1053   // preserved either cannot fire on an integer, or won't in-and-of themselves
1054   // disable SROA (ext) w/o some later use that we would see and disable.
1055   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1056     SROAArgValues[&I] = SROAArg;
1057 
1058   return TargetTransformInfo::TCC_Free ==
1059          TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1060 }
1061 
1062 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1063   // Propagate constants through inttoptr.
1064   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1065         return ConstantExpr::getIntToPtr(COps[0], I.getType());
1066       }))
1067     return true;
1068 
1069   // Track base/offset pairs when round-tripped through a pointer without
1070   // modifications provided the integer is not too large.
1071   Value *Op = I.getOperand(0);
1072   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1073   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1074     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1075     if (BaseAndOffset.first)
1076       ConstantOffsetPtrs[&I] = BaseAndOffset;
1077   }
1078 
1079   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1080   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1081     SROAArgValues[&I] = SROAArg;
1082 
1083   return TargetTransformInfo::TCC_Free ==
1084          TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1085 }
1086 
1087 bool CallAnalyzer::visitCastInst(CastInst &I) {
1088   // Propagate constants through casts.
1089   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1090         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
1091       }))
1092     return true;
1093 
1094   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
1095   disableSROA(I.getOperand(0));
1096 
1097   // If this is a floating-point cast, and the target says this operation
1098   // is expensive, this may eventually become a library call. Treat the cost
1099   // as such.
1100   switch (I.getOpcode()) {
1101   case Instruction::FPTrunc:
1102   case Instruction::FPExt:
1103   case Instruction::UIToFP:
1104   case Instruction::SIToFP:
1105   case Instruction::FPToUI:
1106   case Instruction::FPToSI:
1107     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1108       onCallPenalty();
1109     break;
1110   default:
1111     break;
1112   }
1113 
1114   return TargetTransformInfo::TCC_Free ==
1115          TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
1116 }
1117 
1118 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
1119   Value *Operand = I.getOperand(0);
1120   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1121         return ConstantFoldInstOperands(&I, COps[0], DL);
1122       }))
1123     return true;
1124 
1125   // Disable any SROA on the argument to arbitrary unary instructions.
1126   disableSROA(Operand);
1127 
1128   return false;
1129 }
1130 
1131 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1132   return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1133 }
1134 
1135 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1136   // Does the *call site* have the NonNull attribute set on an argument?  We
1137   // use the attribute on the call site to memoize any analysis done in the
1138   // caller. This will also trip if the callee function has a non-null
1139   // parameter attribute, but that's a less interesting case because hopefully
1140   // the callee would already have been simplified based on that.
1141   if (Argument *A = dyn_cast<Argument>(V))
1142     if (paramHasAttr(A, Attribute::NonNull))
1143       return true;
1144 
1145   // Is this an alloca in the caller?  This is distinct from the attribute case
1146   // above because attributes aren't updated within the inliner itself and we
1147   // always want to catch the alloca derived case.
1148   if (isAllocaDerivedArg(V))
1149     // We can actually predict the result of comparisons between an
1150     // alloca-derived value and null. Note that this fires regardless of
1151     // SROA firing.
1152     return true;
1153 
1154   return false;
1155 }
1156 
1157 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1158   // If the normal destination of the invoke or the parent block of the call
1159   // site is unreachable-terminated, there is little point in inlining this
1160   // unless there is literally zero cost.
1161   // FIXME: Note that it is possible that an unreachable-terminated block has a
1162   // hot entry. For example, in the scenario below, inlining hot_call_X() may be
1163   // beneficial:
1164   // main() {
1165   //   hot_call_1();
1166   //   ...
1167   //   hot_call_N()
1168   //   exit(0);
1169   // }
1170   // For now, we are not handling this corner case here as it is rare in real
1171   // code. In future, we should elaborate this based on BPI and BFI in more
1172   // general threshold adjusting heuristics in updateThreshold().
1173   if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1174     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1175       return false;
1176   } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1177     return false;
1178 
1179   return true;
1180 }
1181 
1182 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1183                                             BlockFrequencyInfo *CallerBFI) {
1184   // If global profile summary is available, then callsite's coldness is
1185   // determined based on that.
1186   if (PSI && PSI->hasProfileSummary())
1187     return PSI->isColdCallSite(Call, CallerBFI);
1188 
1189   // Otherwise we need BFI to be available.
1190   if (!CallerBFI)
1191     return false;
1192 
1193   // Determine if the callsite is cold relative to caller's entry. We could
1194   // potentially cache the computation of scaled entry frequency, but the added
1195   // complexity is not worth it unless this scaling shows up high in the
1196   // profiles.
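  // With the default ColdCallSiteRelFreq of 2, a callsite block executing at
  // less than 2% of the caller's entry frequency is treated as cold here.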
1197   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1198   auto CallSiteBB = Call.getParent();
1199   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1200   auto CallerEntryFreq =
1201       CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1202   return CallSiteFreq < CallerEntryFreq * ColdProb;
1203 }
1204 
1205 Optional<int>
1206 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1207                                                 BlockFrequencyInfo *CallerBFI) {
1208 
1209   // If global profile summary is available, then callsite's hotness is
1210   // determined based on that.
1211   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1212     return Params.HotCallSiteThreshold;
1213 
1214   // Otherwise we need BFI to be available and to have a locally hot callsite
1215   // threshold.
1216   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1217     return None;
1218 
1219   // Determine if the callsite is hot relative to caller's entry. We could
1220   // potentially cache the computation of scaled entry frequency, but the added
1221   // complexity is not worth it unless this scaling shows up high in the
1222   // profiles.
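  // With the default HotCallSiteRelFreq of 60, the callsite block must execute
  // at least 60x as often as the caller's entry block to qualify as locally hot.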
1223   auto CallSiteBB = Call.getParent();
1224   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
1225   auto CallerEntryFreq = CallerBFI->getEntryFreq();
1226   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
1227     return Params.LocallyHotCallSiteThreshold;
1228 
1229   // Otherwise treat it normally.
1230   return None;
1231 }
1232 
1233 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1234   // If no size growth is allowed for this inlining, set Threshold to 0.
1235   if (!allowSizeGrowth(Call)) {
1236     Threshold = 0;
1237     return;
1238   }
1239 
1240   Function *Caller = Call.getCaller();
1241 
1242   // return min(A, B) if B is valid.
1243   auto MinIfValid = [](int A, Optional<int> B) {
1244     return B ? std::min(A, B.getValue()) : A;
1245   };
1246 
1247   // return max(A, B) if B is valid.
1248   auto MaxIfValid = [](int A, Optional<int> B) {
1249     return B ? std::max(A, B.getValue()) : A;
1250   };
1251 
1252   // Various bonus percentages. These are multiplied by Threshold to get the
1253   // bonus values.
1254   // SingleBBBonus: This bonus is applied if the callee has a single reachable
1255   // basic block at the given callsite context. This is speculatively applied
1256   // and withdrawn if more than one basic block is seen.
1257   //
1258   // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1259   // of the last call to a static function as inlining such functions is
1260   // guaranteed to reduce code size.
1261   //
1262   // These bonus percentages may be set to 0 based on properties of the caller
1263   // and the callsite.
1264   int SingleBBBonusPercent = 50;
1265   int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1266   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1267 
1268   // Lambda to set all the above bonus and bonus percentages to 0.
1269   auto DisallowAllBonuses = [&]() {
1270     SingleBBBonusPercent = 0;
1271     VectorBonusPercent = 0;
1272     LastCallToStaticBonus = 0;
1273   };
1274 
1275   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1276   // and reduce the threshold if the caller has the necessary attribute.
1277   if (Caller->hasMinSize()) {
1278     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1279     // For minsize, we want to disable the single BB bonus and the vector
1280     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1281     // a static function will, at the minimum, eliminate the parameter setup and
1282     // call/return instructions.
1283     SingleBBBonusPercent = 0;
1284     VectorBonusPercent = 0;
1285   } else if (Caller->hasOptSize())
1286     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1287 
1288   // Adjust the threshold based on inlinehint attribute and profile based
1289   // hotness information if the caller does not have MinSize attribute.
1290   if (!Caller->hasMinSize()) {
1291     if (Callee.hasFnAttribute(Attribute::InlineHint))
1292       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1293 
1294     // FIXME: After switching to the new passmanager, simplify the logic below
1295     // by checking only the callsite hotness/coldness as we will reliably
1296     // have local profile information.
1297     //
1298     // Callsite hotness and coldness can be determined if sample profile is
1299     // used (which adds hotness metadata to calls) or if caller's
1300     // BlockFrequencyInfo is available.
1301     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
1302     auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1303     if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1304       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1305       // FIXME: This should update the threshold only if it exceeds the
1306       // current threshold, but AutoFDO + ThinLTO currently relies on this
1307       // behavior to prevent inlining of hot callsites during ThinLTO
1308       // compile phase.
1309       Threshold = HotCallSiteThreshold.getValue();
1310     } else if (isColdCallSite(Call, CallerBFI)) {
1311       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1312       // Do not apply bonuses for a cold callsite including the
1313       // LastCallToStatic bonus. While this bonus might result in code size
1314       // reduction, it can cause the size of a non-cold caller to increase
1315       // preventing it from being inlined.
1316       DisallowAllBonuses();
1317       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1318     } else if (PSI) {
1319       // Use callee's global profile information only if we have no way of
1320       // determining this via callsite information.
1321       if (PSI->isFunctionEntryHot(&Callee)) {
1322         LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
1324         // that the callee is hot and treat it as a weaker hint for threshold
1325         // increase.
1326         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1327       } else if (PSI->isFunctionEntryCold(&Callee)) {
1328         LLVM_DEBUG(dbgs() << "Cold callee.\n");
1329         // Do not apply bonuses for a cold callee including the
1330         // LastCallToStatic bonus. While this bonus might result in code size
1331         // reduction, it can cause the size of a non-cold caller to increase
1332         // preventing it from being inlined.
1333         DisallowAllBonuses();
1334         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
1335       }
1336     }
1337   }
1338 
1339   // Finally, take the target-specific inlining threshold multiplier into
1340   // account.
1341   Threshold *= TTI.getInliningThresholdMultiplier();
1342 
1343   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1344   VectorBonus = Threshold * VectorBonusPercent / 100;
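  // For illustration (hypothetical values, not the defaults for any target):
  // a threshold of 300 with SingleBBBonusPercent = 50 and a vector bonus
  // percent of 150 would yield SingleBBBonus = 150 and VectorBonus = 450.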
1345 
1346   bool OnlyOneCallAndLocalLinkage =
1347       F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call to the function and it has local linkage, the
  // cost of inlining it drops dramatically. It may seem odd to update Cost in
  // updateThreshold, but the bonus depends on the logic in this method.
1351   if (OnlyOneCallAndLocalLinkage)
1352     Cost -= LastCallToStaticBonus;
1353 }
1354 
1355 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1356   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1357   // First try to handle simplified comparisons.
1358   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1359         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
1360       }))
1361     return true;
1362 
1363   if (I.getOpcode() == Instruction::FCmp)
1364     return false;
1365 
1366   // Otherwise look for a comparison between constant offset pointers with
1367   // a common base.
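  //
  // For illustration (hypothetical IR, not from any test): given
  //   %p = getelementptr inbounds i32, i32* %base, i64 1
  //   %q = getelementptr inbounds i32, i32* %base, i64 3
  // an 'icmp ult i32* %p, %q' sees the common base %base with constant byte
  // offsets 4 and 12 and folds to true below.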
1368   Value *LHSBase, *RHSBase;
1369   APInt LHSOffset, RHSOffset;
1370   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1371   if (LHSBase) {
1372     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1373     if (RHSBase && LHSBase == RHSBase) {
1374       // We have common bases, fold the icmp to a constant based on the
1375       // offsets.
1376       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1377       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1378       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1379         SimplifiedValues[&I] = C;
1380         ++NumConstantPtrCmps;
1381         return true;
1382       }
1383     }
1384   }
1385 
  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
1388   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1389       isKnownNonNullInCallee(I.getOperand(0))) {
1390     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1391     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1392                                       : ConstantInt::getFalse(I.getType());
1393     return true;
1394   }
1395   return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
1396 }
1397 
1398 bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold the difference of two pointers
  // that have constant offsets from a common base.
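  //
  // For illustration (hypothetical values): if both operands track the same
  // base pointer with constant offsets 16 and 4, the subtraction folds to the
  // constant 12 below.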
1401   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1402   Value *LHSBase, *RHSBase;
1403   APInt LHSOffset, RHSOffset;
1404   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1405   if (LHSBase) {
1406     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1407     if (RHSBase && LHSBase == RHSBase) {
1408       // We have common bases, fold the subtract to a constant based on the
1409       // offsets.
1410       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1411       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1412       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1413         SimplifiedValues[&I] = C;
1414         ++NumConstantPtrDiffs;
1415         return true;
1416       }
1417     }
1418   }
1419 
1420   // Otherwise, fall back to the generic logic for simplifying and handling
1421   // instructions.
1422   return Base::visitSub(I);
1423 }
1424 
1425 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1426   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1427   Constant *CLHS = dyn_cast<Constant>(LHS);
1428   if (!CLHS)
1429     CLHS = SimplifiedValues.lookup(LHS);
1430   Constant *CRHS = dyn_cast<Constant>(RHS);
1431   if (!CRHS)
1432     CRHS = SimplifiedValues.lookup(RHS);
1433 
1434   Value *SimpleV = nullptr;
1435   if (auto FI = dyn_cast<FPMathOperator>(&I))
1436     SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
1437                             FI->getFastMathFlags(), DL);
1438   else
1439     SimpleV =
1440         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1441 
1442   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1443     SimplifiedValues[&I] = C;
1444 
1445   if (SimpleV)
1446     return true;
1447 
1448   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1449   disableSROA(LHS);
1450   disableSROA(RHS);
1451 
  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such, unless it's an fneg, which can be implemented with an xor.
1455   using namespace llvm::PatternMatch;
1456   if (I.getType()->isFloatingPointTy() &&
1457       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
1458       !match(&I, m_FNeg(m_Value())))
1459     onCallPenalty();
1460 
1461   return false;
1462 }
1463 
1464 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
1465   Value *Op = I.getOperand(0);
1466   Constant *COp = dyn_cast<Constant>(Op);
1467   if (!COp)
1468     COp = SimplifiedValues.lookup(Op);
1469 
1470   Value *SimpleV = SimplifyFNegInst(
1471       COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
1472 
1473   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1474     SimplifiedValues[&I] = C;
1475 
1476   if (SimpleV)
1477     return true;
1478 
1479   // Disable any SROA on arguments to arbitrary, unsimplified fneg.
1480   disableSROA(Op);
1481 
1482   return false;
1483 }
1484 
1485 bool CallAnalyzer::visitLoad(LoadInst &I) {
1486   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1487     return true;
1488 
1489   // If the data is already loaded from this address and hasn't been clobbered
1490   // by any stores or calls, this load is likely to be redundant and can be
1491   // eliminated.
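  //
  // For illustration (hypothetical pointer %p): a second unordered load of %p
  // with no intervening store or call that could clobber it is credited as a
  // load-elimination saving rather than charged as a cost.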
1492   if (EnableLoadElimination &&
1493       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1494     onLoadEliminationOpportunity();
1495     return true;
1496   }
1497 
1498   return false;
1499 }
1500 
1501 bool CallAnalyzer::visitStore(StoreInst &I) {
1502   if (handleSROA(I.getPointerOperand(), I.isSimple()))
1503     return true;
1504 
1505   // The store can potentially clobber loads and prevent repeated loads from
1506   // being eliminated.
1507   // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
1509   // from the cost even when we finally see a store. We just need to disable
1510   // *further* accumulation of elimination savings.
1511   // 2. We should probably at some point thread MemorySSA for the callee into
1512   // this and then use that to actually compute *really* precise savings.
1513   disableLoadElimination();
1514   return false;
1515 }
1516 
1517 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1518   // Constant folding for extract value is trivial.
1519   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1520         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1521       }))
1522     return true;
1523 
1524   // SROA can look through these but give them a cost.
1525   return false;
1526 }
1527 
1528 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1529   // Constant folding for insert value is trivial.
1530   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1531         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1532                                             /*InsertedValueOperand*/ COps[1],
1533                                             I.getIndices());
1534       }))
1535     return true;
1536 
1537   // SROA can look through these but give them a cost.
1538   return false;
1539 }
1540 
1541 /// Try to simplify a call site.
1542 ///
1543 /// Takes a concrete function and callsite and tries to actually simplify it by
1544 /// analyzing the arguments and call itself with instsimplify. Returns true if
1545 /// it has simplified the callsite to some other entity (a constant), making it
1546 /// free.
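///
/// For illustration (hypothetical call, not from any test): a call to
/// llvm.ctpop.i32 whose argument has been simplified to the constant 255
/// constant-folds to 8 here and therefore becomes free.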
1547 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
1548   // FIXME: Using the instsimplify logic directly for this is inefficient
1549   // because we have to continually rebuild the argument list even when no
1550   // simplifications can be performed. Until that is fixed with remapping
1551   // inside of instsimplify, directly constant fold calls here.
1552   if (!canConstantFoldCallTo(&Call, F))
1553     return false;
1554 
1555   // Try to re-map the arguments to constants.
1556   SmallVector<Constant *, 4> ConstantArgs;
1557   ConstantArgs.reserve(Call.arg_size());
1558   for (Value *I : Call.args()) {
1559     Constant *C = dyn_cast<Constant>(I);
1560     if (!C)
1561       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
1562     if (!C)
1563       return false; // This argument doesn't map to a constant.
1564 
1565     ConstantArgs.push_back(C);
1566   }
1567   if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
1568     SimplifiedValues[&Call] = C;
1569     return true;
1570   }
1571 
1572   return false;
1573 }
1574 
1575 bool CallAnalyzer::visitCallBase(CallBase &Call) {
1576   if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
1577       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1578     // This aborts the entire analysis.
1579     ExposesReturnsTwice = true;
1580     return false;
1581   }
1582   if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
1583     ContainsNoDuplicateCall = true;
1584 
1585   Value *Callee = Call.getCalledOperand();
1586   Function *F = dyn_cast_or_null<Function>(Callee);
1587   bool IsIndirectCall = !F;
1588   if (IsIndirectCall) {
1589     // Check if this happens to be an indirect function call to a known function
1590     // in this inline context. If not, we've done all we can.
1591     F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1592     if (!F) {
1593       onCallArgumentSetup(Call);
1594 
1595       if (!Call.onlyReadsMemory())
1596         disableLoadElimination();
1597       return Base::visitCallBase(Call);
1598     }
1599   }
1600 
1601   assert(F && "Expected a call to a known function");
1602 
1603   // When we have a concrete function, first try to simplify it directly.
1604   if (simplifyCallSite(F, Call))
1605     return true;
1606 
1607   // Next check if it is an intrinsic we know about.
1608   // FIXME: Lift this into part of the InstVisitor.
1609   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
1610     switch (II->getIntrinsicID()) {
1611     default:
1612       if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1613         disableLoadElimination();
1614       return Base::visitCallBase(Call);
1615 
1616     case Intrinsic::load_relative:
1617       onLoadRelativeIntrinsic();
1618       return false;
1619 
1620     case Intrinsic::memset:
1621     case Intrinsic::memcpy:
1622     case Intrinsic::memmove:
1623       disableLoadElimination();
1624       // SROA can usually chew through these intrinsics, but they aren't free.
1625       return false;
1626     case Intrinsic::icall_branch_funnel:
1627     case Intrinsic::localescape:
1628       HasUninlineableIntrinsic = true;
1629       return false;
1630     case Intrinsic::vastart:
1631       InitsVargArgs = true;
1632       return false;
1633     }
1634   }
1635 
1636   if (F == Call.getFunction()) {
1637     // This flag will fully abort the analysis, so don't bother with anything
1638     // else.
1639     IsRecursiveCall = true;
1640     return false;
1641   }
1642 
1643   if (TTI.isLoweredToCall(F)) {
1644     onLoweredCall(F, Call, IsIndirectCall);
1645   }
1646 
1647   if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
1648     disableLoadElimination();
1649   return Base::visitCallBase(Call);
1650 }
1651 
1652 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1653   // At least one return instruction will be free after inlining.
1654   bool Free = !HasReturn;
1655   HasReturn = true;
1656   return Free;
1657 }
1658 
1659 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1660   // We model unconditional branches as essentially free -- they really
1661   // shouldn't exist at all, but handling them makes the behavior of the
1662   // inliner more regular and predictable. Interestingly, conditional branches
1663   // which will fold away are also free.
1664   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1665          dyn_cast_or_null<ConstantInt>(
1666              SimplifiedValues.lookup(BI.getCondition()));
1667 }
1668 
1669 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1670   bool CheckSROA = SI.getType()->isPointerTy();
1671   Value *TrueVal = SI.getTrueValue();
1672   Value *FalseVal = SI.getFalseValue();
1673 
1674   Constant *TrueC = dyn_cast<Constant>(TrueVal);
1675   if (!TrueC)
1676     TrueC = SimplifiedValues.lookup(TrueVal);
1677   Constant *FalseC = dyn_cast<Constant>(FalseVal);
1678   if (!FalseC)
1679     FalseC = SimplifiedValues.lookup(FalseVal);
1680   Constant *CondC =
1681       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1682 
1683   if (!CondC) {
1684     // Select C, X, X => X
1685     if (TrueC == FalseC && TrueC) {
1686       SimplifiedValues[&SI] = TrueC;
1687       return true;
1688     }
1689 
1690     if (!CheckSROA)
1691       return Base::visitSelectInst(SI);
1692 
1693     std::pair<Value *, APInt> TrueBaseAndOffset =
1694         ConstantOffsetPtrs.lookup(TrueVal);
1695     std::pair<Value *, APInt> FalseBaseAndOffset =
1696         ConstantOffsetPtrs.lookup(FalseVal);
1697     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1698       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1699 
1700       if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
1701         SROAArgValues[&SI] = SROAArg;
1702       return true;
1703     }
1704 
1705     return Base::visitSelectInst(SI);
1706   }
1707 
1708   // Select condition is a constant.
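  // For illustration (hypothetical IR): 'select i1 true, i32 %x, i32 %y'
  // selects %x below, and an all-zeros condition selects the false operand.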
1709   Value *SelectedV = CondC->isAllOnesValue()
1710                          ? TrueVal
1711                          : (CondC->isNullValue()) ? FalseVal : nullptr;
1712   if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle cases such
    // as vector selects.
1716     if (TrueC && FalseC) {
1717       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1718         SimplifiedValues[&SI] = C;
1719         return true;
1720       }
1721     }
1722     return Base::visitSelectInst(SI);
1723   }
1724 
1725   // Condition is either all 1s or all 0s. SI can be simplified.
1726   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1727     SimplifiedValues[&SI] = SelectedC;
1728     return true;
1729   }
1730 
1731   if (!CheckSROA)
1732     return true;
1733 
1734   std::pair<Value *, APInt> BaseAndOffset =
1735       ConstantOffsetPtrs.lookup(SelectedV);
1736   if (BaseAndOffset.first) {
1737     ConstantOffsetPtrs[&SI] = BaseAndOffset;
1738 
1739     if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
1740       SROAArgValues[&SI] = SROAArg;
1741   }
1742 
1743   return true;
1744 }
1745 
1746 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model switches with a known constant condition as free since they fold
  // to an unconditional branch; see the comments on handling branches.
1749   if (isa<ConstantInt>(SI.getCondition()))
1750     return true;
1751   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1752     if (isa<ConstantInt>(V))
1753       return true;
1754 
1755   // Assume the most general case where the switch is lowered into
1756   // either a jump table, bit test, or a balanced binary tree consisting of
1757   // case clusters without merging adjacent clusters with the same
1758   // destination. We do not consider the switches that are lowered with a mix
1759   // of jump table/bit test/binary search tree. The cost of the switch is
1760   // proportional to the size of the tree or the size of jump table range.
1761   //
1762   // NB: We convert large switches which are just used to initialize large phi
1763   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1764   // inlining those. It will prevent inlining in cases where the optimization
1765   // does not (yet) fire.
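  //
  // For illustration (hypothetical numbers): a dense switch over 0..63 is
  // costed in terms of the jump table range, while a few widely scattered
  // cases that cannot form a jump table are costed as a balanced binary
  // search over their case clusters.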
1766 
1767   unsigned JumpTableSize = 0;
1768   BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
1769   unsigned NumCaseCluster =
1770       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
1771 
1772   onFinalizeSwitch(JumpTableSize, NumCaseCluster);
1773   return false;
1774 }
1775 
1776 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. Inlining
  // one would be incorrect because all the blockaddresses (in static global
  // initializers, for example) would still refer to the original function,
  // and the indirect jump would jump from the inlined copy of the function
  // into the original function, which is undefined behavior.
1782   // FIXME: This logic isn't really right; we can safely inline functions with
1783   // indirectbr's as long as no other function or global references the
1784   // blockaddress of a block within the current function.
1785   HasIndirectBr = true;
1786   return false;
1787 }
1788 
1789 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1790   // FIXME: It's not clear that a single instruction is an accurate model for
1791   // the inline cost of a resume instruction.
1792   return false;
1793 }
1794 
1795 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1796   // FIXME: It's not clear that a single instruction is an accurate model for
1797   // the inline cost of a cleanupret instruction.
1798   return false;
1799 }
1800 
1801 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1802   // FIXME: It's not clear that a single instruction is an accurate model for
1803   // the inline cost of a catchret instruction.
1804   return false;
1805 }
1806 
1807 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
1809   // to unreachable as they have the lowest possible impact on both runtime and
1810   // code size.
1811   return true; // No actual code is needed for unreachable.
1812 }
1813 
1814 bool CallAnalyzer::visitInstruction(Instruction &I) {
1815   // Some instructions are free. All of the free intrinsics can also be
1816   // handled by SROA, etc.
1817   if (TargetTransformInfo::TCC_Free ==
1818       TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency))
1819     return true;
1820 
1821   // We found something we don't understand or can't handle. Mark any SROA-able
1822   // values in the operand list as no longer viable.
1823   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1824     disableSROA(*OI);
1825 
1826   return false;
1827 }
1828 
1829 /// Analyze a basic block for its contribution to the inline cost.
1830 ///
1831 /// This method walks the analyzer over every instruction in the given basic
1832 /// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns a failure result if inlining is no
/// longer viable, and a success result if it remains viable.
1836 InlineResult
1837 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1838                            SmallPtrSetImpl<const Value *> &EphValues) {
1839   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them to constants or dead code during
    // inlining, is used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
1847     if (isa<DbgInfoIntrinsic>(I))
1848       continue;
1849 
1850     // Skip ephemeral values.
1851     if (EphValues.count(&*I))
1852       continue;
1853 
1854     ++NumInstructions;
1855     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1856       ++NumVectorInstructions;
1857 
1858     // If the instruction simplified to a constant, there is no cost to this
1859     // instruction. Visit the instructions using our InstVisitor to account for
1860     // all of the per-instruction logic. The visit tree returns true if we
1861     // consumed the instruction in any way, and false if the instruction's base
1862     // cost should count against inlining.
1863     onInstructionAnalysisStart(&*I);
1864 
1865     if (Base::visit(&*I))
1866       ++NumInstructionsSimplified;
1867     else
1868       onMissedSimplification();
1869 
1870     onInstructionAnalysisFinish(&*I);
1871     using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
1873     InlineResult IR = InlineResult::success();
1874     if (IsRecursiveCall)
1875       IR = InlineResult::failure("recursive");
1876     else if (ExposesReturnsTwice)
1877       IR = InlineResult::failure("exposes returns twice");
1878     else if (HasDynamicAlloca)
1879       IR = InlineResult::failure("dynamic alloca");
1880     else if (HasIndirectBr)
1881       IR = InlineResult::failure("indirect branch");
1882     else if (HasUninlineableIntrinsic)
1883       IR = InlineResult::failure("uninlinable intrinsic");
1884     else if (InitsVargArgs)
1885       IR = InlineResult::failure("varargs");
1886     if (!IR.isSuccess()) {
1887       if (ORE)
1888         ORE->emit([&]() {
1889           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1890                                           &CandidateCall)
1891                  << NV("Callee", &F) << " has uninlinable pattern ("
1892                  << NV("InlineResult", IR.getFailureReason())
1893                  << ") and cost is not fully computed";
1894         });
1895       return IR;
1896     }
1897 
1898     // If the caller is a recursive function then we don't want to inline
1899     // functions which allocate a lot of stack space because it would increase
1900     // the caller stack usage dramatically.
1901     if (IsCallerRecursive &&
1902         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1903       auto IR =
1904           InlineResult::failure("recursive and allocates too much stack space");
1905       if (ORE)
1906         ORE->emit([&]() {
1907           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1908                                           &CandidateCall)
1909                  << NV("Callee", &F) << " is "
1910                  << NV("InlineResult", IR.getFailureReason())
1911                  << ". Cost is not fully computed";
1912         });
1913       return IR;
1914     }
1915 
1916     if (shouldStop())
1917       return InlineResult::failure(
1918           "Call site analysis is not favorable to inlining.");
1919   }
1920 
1921   return InlineResult::success();
1922 }
1923 
1924 /// Compute the base pointer and cumulative constant offsets for V.
1925 ///
1926 /// This strips all constant offsets off of V, leaving it the base pointer, and
1927 /// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
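///
/// For illustration (hypothetical IR): for V defined as
///   %v = getelementptr inbounds i32, i32* %a, i64 3
/// this rewrites V to %a and returns the constant 12 (three 4-byte elements),
/// assuming byte-addressed pointers.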
1930 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1931   if (!V->getType()->isPointerTy())
1932     return nullptr;
1933 
1934   unsigned AS = V->getType()->getPointerAddressSpace();
1935   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1936   APInt Offset = APInt::getNullValue(IntPtrWidth);
1937 
1938   // Even though we don't look through PHI nodes, we could be called on an
1939   // instruction in an unreachable block, which may be on a cycle.
1940   SmallPtrSet<Value *, 4> Visited;
1941   Visited.insert(V);
1942   do {
1943     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1944       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1945         return nullptr;
1946       V = GEP->getPointerOperand();
1947     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1948       V = cast<Operator>(V)->getOperand(0);
1949     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1950       if (GA->isInterposable())
1951         break;
1952       V = GA->getAliasee();
1953     } else {
1954       break;
1955     }
1956     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1957   } while (Visited.insert(V).second);
1958 
1959   Type *IdxPtrTy = DL.getIndexType(V->getType());
1960   return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
1961 }
1962 
1963 /// Find dead blocks due to deleted CFG edges during inlining.
1964 ///
1965 /// If we know the successor of the current block, \p CurrBB, has to be \p
1966 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1967 /// no live incoming CFG edges.  If one block is found to be dead, we can
1968 /// continue growing the dead block list by checking the successors of the dead
1969 /// blocks to see if all their incoming edges are dead or not.
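///
/// For illustration (hypothetical CFG): if \p CurrBB ends in a conditional
/// branch whose condition has been simplified so that \p NextBB is the only
/// live successor, and the other successor has no predecessor besides \p
/// CurrBB, then that successor is dead, and so is every block reachable only
/// through it.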
1970 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1971   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1972     // A CFG edge is dead if the predecessor is dead or the predecessor has a
1973     // known successor which is not the one under exam.
1974     return (DeadBlocks.count(Pred) ||
1975             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1976   };
1977 
1978   auto IsNewlyDead = [&](BasicBlock *BB) {
1979     // If all the edges to a block are dead, the block is also dead.
1980     return (!DeadBlocks.count(BB) &&
1981             llvm::all_of(predecessors(BB),
1982                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1983   };
1984 
1985   for (BasicBlock *Succ : successors(CurrBB)) {
1986     if (Succ == NextBB || !IsNewlyDead(Succ))
1987       continue;
1988     SmallVector<BasicBlock *, 4> NewDead;
1989     NewDead.push_back(Succ);
1990     while (!NewDead.empty()) {
1991       BasicBlock *Dead = NewDead.pop_back_val();
1992       if (DeadBlocks.insert(Dead))
1993         // Continue growing the dead block lists.
1994         for (BasicBlock *S : successors(Dead))
1995           if (IsNewlyDead(S))
1996             NewDead.push_back(S);
1997     }
1998   }
1999 }
2000 
2001 /// Analyze a call site for potential inlining.
2002 ///
/// Returns a success result if inlining this call is viable, and a failure
/// result if it is not. It computes the cost and adjusts the threshold based
/// on numerous factors and heuristics. If this method returns a failure but
/// the computed cost is below the computed threshold, then inlining was
/// forcibly disabled by some artifact of the routine.
2008 InlineResult CallAnalyzer::analyze() {
2009   ++NumCallsAnalyzed;
2010 
2011   auto Result = onAnalysisStart();
2012   if (!Result.isSuccess())
2013     return Result;
2014 
2015   if (F.empty())
2016     return InlineResult::success();
2017 
2018   Function *Caller = CandidateCall.getFunction();
2019   // Check if the caller function is recursive itself.
2020   for (User *U : Caller->users()) {
2021     CallBase *Call = dyn_cast<CallBase>(U);
2022     if (Call && Call->getFunction() == Caller) {
2023       IsCallerRecursive = true;
2024       break;
2025     }
2026   }
2027 
2028   // Populate our simplified values by mapping from function arguments to call
2029   // arguments with known important simplifications.
2030   auto CAI = CandidateCall.arg_begin();
2031   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
2032        FAI != FAE; ++FAI, ++CAI) {
2033     assert(CAI != CandidateCall.arg_end());
2034     if (Constant *C = dyn_cast<Constant>(CAI))
2035       SimplifiedValues[&*FAI] = C;
2036 
2037     Value *PtrArg = *CAI;
2038     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2039       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
2040 
2041       // We can SROA any pointer arguments derived from alloca instructions.
2042       if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2043         SROAArgValues[&*FAI] = SROAArg;
2044         onInitializeSROAArg(SROAArg);
2045         EnabledSROAAllocas.insert(SROAArg);
2046       }
2047     }
2048   }
2049   NumConstantArgs = SimplifiedValues.size();
2050   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2051   NumAllocaArgs = SROAArgValues.size();
2052 
2053   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2054   // the ephemeral values multiple times (and they're completely determined by
2055   // the callee, so this is purely duplicate work).
2056   SmallPtrSet<const Value *, 32> EphValues;
2057   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2058 
2059   // The worklist of live basic blocks in the callee *after* inlining. We avoid
2060   // adding basic blocks of the callee which can be proven to be dead for this
2061   // particular call site in order to get more accurate cost estimates. This
2062   // requires a somewhat heavyweight iteration pattern: we need to walk the
2063   // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, we use a small-size optimized SetVector, prioritizing the
  // small case because we typically exit early once the threshold is crossed.
2066   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
2067                     SmallPtrSet<BasicBlock *, 16>>
2068       BBSetVector;
2069   BBSetVector BBWorklist;
2070   BBWorklist.insert(&F.getEntryBlock());
2071 
2072   // Note that we *must not* cache the size, this loop grows the worklist.
2073   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2074     if (shouldStop())
2075       break;
2076 
2077     BasicBlock *BB = BBWorklist[Idx];
2078     if (BB->empty())
2079       continue;
2080 
2081     // Disallow inlining a blockaddress with uses other than strictly callbr.
2082     // A blockaddress only has defined behavior for an indirect branch in the
2083     // same function, and we do not currently support inlining indirect
2084     // branches.  But, the inliner may not see an indirect branch that ends up
2085     // being dead code at a particular call site. If the blockaddress escapes
2086     // the function, e.g., via a global variable, inlining may lead to an
2087     // invalid cross-function reference.
2088     // FIXME: pr/39560: continue relaxing this overt restriction.
2089     if (BB->hasAddressTaken())
2090       for (User *U : BlockAddress::get(&*BB)->users())
2091         if (!isa<CallBrInst>(*U))
2092           return InlineResult::failure("blockaddress used outside of callbr");
2093 
    // Analyze the cost of this block. If we blow through the threshold, this
    // returns a failure and we can bail out.
2096     InlineResult IR = analyzeBlock(BB, EphValues);
2097     if (!IR.isSuccess())
2098       return IR;
2099 
2100     Instruction *TI = BB->getTerminator();
2101 
    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
2104     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2105       if (BI->isConditional()) {
2106         Value *Cond = BI->getCondition();
2107         if (ConstantInt *SimpleCond =
2108                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2109           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2110           BBWorklist.insert(NextBB);
2111           KnownSuccessors[BB] = NextBB;
2112           findDeadBlocks(BB, NextBB);
2113           continue;
2114         }
2115       }
2116     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2117       Value *Cond = SI->getCondition();
2118       if (ConstantInt *SimpleCond =
2119               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2120         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2121         BBWorklist.insert(NextBB);
2122         KnownSuccessors[BB] = NextBB;
2123         findDeadBlocks(BB, NextBB);
2124         continue;
2125       }
2126     }
2127 
2128     // If we're unable to select a particular successor, just count all of
2129     // them.
2130     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2131          ++TIdx)
2132       BBWorklist.insert(TI->getSuccessor(TIdx));
2133 
2134     onBlockAnalyzed(BB);
2135   }
2136 
2137   bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
2138                                     &F == CandidateCall.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the callee (so the instruction
  // is not actually duplicated, just moved).
2142   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
2143     return InlineResult::failure("noduplicate");
2144 
2145   return finalizeAnalysis();
2146 }
2147 
2148 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2149 /// Dump stats about this call's analysis.
2150 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
2151 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
2152   if (PrintDebugInstructionDeltas)
2153     F.print(dbgs(), &Writer);
2154   DEBUG_PRINT_STAT(NumConstantArgs);
2155   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2156   DEBUG_PRINT_STAT(NumAllocaArgs);
2157   DEBUG_PRINT_STAT(NumConstantPtrCmps);
2158   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2159   DEBUG_PRINT_STAT(NumInstructionsSimplified);
2160   DEBUG_PRINT_STAT(NumInstructions);
2161   DEBUG_PRINT_STAT(SROACostSavings);
2162   DEBUG_PRINT_STAT(SROACostSavingsLost);
2163   DEBUG_PRINT_STAT(LoadEliminationCost);
2164   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2165   DEBUG_PRINT_STAT(Cost);
2166   DEBUG_PRINT_STAT(Threshold);
2167 #undef DEBUG_PRINT_STAT
2168 }
2169 #endif
2170 
/// Test that there are no attribute conflicts between Caller and Callee that
/// prevent inlining.
2173 static bool functionsHaveCompatibleAttributes(
2174     Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2175     function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2176   // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2177   // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2178   // object, and always returns the same object (which is overwritten on each
2179   // GetTLI call). Therefore we copy the first result.
2180   auto CalleeTLI = GetTLI(*Callee);
2181   return TTI.areInlineCompatible(Caller, Callee) &&
2182          GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2183                                              InlineCallerSupersetNoBuiltin) &&
2184          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2185 }
2186 
2187 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
2188   int Cost = 0;
2189   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2190     if (Call.isByValArgument(I)) {
2191       // We approximate the number of loads and stores needed by dividing the
2192       // size of the byval type by the target's pointer size.
2193       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2194       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
2195       unsigned AS = PTy->getAddressSpace();
2196       unsigned PointerSize = DL.getPointerSizeInBits(AS);
2197       // Ceiling division.
2198       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2199 
2200       // If it generates more than 8 stores it is likely to be expanded as an
2201       // inline memcpy so we take that as an upper bound. Otherwise we assume
2202       // one load and one store per word copied.
2203       // FIXME: The maxStoresPerMemcpy setting from the target should be used
2204       // here instead of a magic number of 8, but it's not available via
2205       // DataLayout.
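      //
      // Worked example (illustrative): a 256-bit byval type with 64-bit
      // pointers needs ceil(256 / 64) = 4 stores, so the argument contributes
      // 2 * 4 * InlineConstants::InstrCost to the cost below.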
2206       NumStores = std::min(NumStores, 8U);
2207 
2208       Cost += 2 * NumStores * InlineConstants::InstrCost;
2209     } else {
2210       // For non-byval arguments subtract off one instruction per call
2211       // argument.
2212       Cost += InlineConstants::InstrCost;
2213     }
2214   }
2215   // The call instruction also disappears after inlining.
2216   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
2217   return Cost;
2218 }
2219 
2220 InlineCost llvm::getInlineCost(
2221     CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2222     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2223     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2224     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2225     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2226   return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2227                        GetAssumptionCache, GetBFI, GetTLI, PSI, ORE);
2228 }
2229 
2230 Optional<int> llvm::getInliningCostEstimate(
2231     CallBase &Call, TargetTransformInfo &CalleeTTI,
2232     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2233     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2234     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2235   const InlineParams Params = {/* DefaultThreshold*/ 0,
2236                                /*HintThreshold*/ {},
2237                                /*ColdThreshold*/ {},
2238                                /*OptSizeThreshold*/ {},
2239                                /*OptMinSizeThreshold*/ {},
2240                                /*HotCallSiteThreshold*/ {},
2241                                /*LocallyHotCallSiteThreshold*/ {},
2242                                /*ColdCallSiteThreshold*/ {},
2243                                /* ComputeFullInlineCost*/ true};
2244 
2245   InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
2246                             *Call.getCalledFunction(), Call, Params, true,
2247                             /*IgnoreThreshold*/ true);
2248   auto R = CA.analyze();
2249   if (!R.isSuccess())
2250     return None;
2251   return CA.getCost();
2252 }
2253 
2254 Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
2255     CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
2256     function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
2257 
2258   // Cannot inline indirect calls.
2259   if (!Callee)
2260     return InlineResult::failure("indirect call");
2261 
  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle the argument
  // being in the alloca address space (so it is a little bit complicated to
  // solve).
2267   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2268   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2269     if (Call.isByValArgument(I)) {
2270       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2271       if (PTy->getAddressSpace() != AllocaAS)
2272         return InlineResult::failure("byval arguments without alloca"
2273                                      " address space");
2274     }
2275 
2276   // Calls to functions with always-inline attributes should be inlined
2277   // whenever possible.
2278   if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2279     auto IsViable = isInlineViable(*Callee);
2280     if (IsViable.isSuccess())
2281       return InlineResult::success();
2282     return InlineResult::failure(IsViable.getFailureReason());
2283   }
2284 
2285   // Never inline functions with conflicting attributes (unless callee has
2286   // always-inline attribute).
2287   Function *Caller = Call.getCaller();
2288   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
2289     return InlineResult::failure("conflicting attributes");
2290 
2291   // Don't inline this call if the caller has the optnone attribute.
2292   if (Caller->hasOptNone())
2293     return InlineResult::failure("optnone attribute");
2294 
2295   // Don't inline a function that treats null pointer as valid into a caller
2296   // that does not have this attribute.
2297   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2298     return InlineResult::failure("nullptr definitions incompatible");
2299 
2300   // Don't inline functions which can be interposed at link-time.
2301   if (Callee->isInterposable())
2302     return InlineResult::failure("interposable");
2303 
2304   // Don't inline functions marked noinline.
2305   if (Callee->hasFnAttribute(Attribute::NoInline))
2306     return InlineResult::failure("noinline function attribute");
2307 
2308   // Don't inline call sites marked noinline.
2309   if (Call.isNoInline())
2310     return InlineResult::failure("noinline call site attribute");
2311 
2312   return None;
2313 }
2314 
2315 InlineCost llvm::getInlineCost(
2316     CallBase &Call, Function *Callee, const InlineParams &Params,
2317     TargetTransformInfo &CalleeTTI,
2318     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2319     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2320     function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
2321     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2322 
2323   auto UserDecision =
2324       llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
2325 
2326   if (UserDecision.hasValue()) {
2327     if (UserDecision->isSuccess())
2328       return llvm::InlineCost::getAlways("always inline attribute");
2329     return llvm::InlineCost::getNever(UserDecision->getFailureReason());
2330   }
2331 
2332   LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
2333                           << "... (caller:" << Call.getCaller()->getName()
2334                           << ")\n");
2335 
2336   InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
2337                             *Callee, Call, Params);
2338   InlineResult ShouldInline = CA.analyze();
2339 
2340   LLVM_DEBUG(CA.dump());
2341 
2342   // Check if there was a reason to force inlining or no inlining.
2343   if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
2344     return InlineCost::getNever(ShouldInline.getFailureReason());
2345   if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
2346     return InlineCost::getAlways("empty function");
2347 
2348   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2349 }
2350 
2351 InlineResult llvm::isInlineViable(Function &F) {
2352   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2353   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2354     // Disallow inlining of functions which contain indirect branches.
2355     if (isa<IndirectBrInst>(BI->getTerminator()))
2356       return InlineResult::failure("contains indirect branches");
2357 
2358     // Disallow inlining of blockaddresses which are used by non-callbr
2359     // instructions.
2360     if (BI->hasAddressTaken())
2361       for (User *U : BlockAddress::get(&*BI)->users())
2362         if (!isa<CallBrInst>(*U))
2363           return InlineResult::failure("blockaddress used outside of callbr");
2364 
2365     for (auto &II : *BI) {
2366       CallBase *Call = dyn_cast<CallBase>(&II);
2367       if (!Call)
2368         continue;
2369 
2370       // Disallow recursive calls.
2371       if (&F == Call->getCalledFunction())
2372         return InlineResult::failure("recursive call");
2373 
2374       // Disallow calls which expose returns-twice to a function not previously
2375       // attributed as such.
2376       if (!ReturnsTwice && isa<CallInst>(Call) &&
2377           cast<CallInst>(Call)->canReturnTwice())
2378         return InlineResult::failure("exposes returns-twice attribute");
2379 
2380       if (Call->getCalledFunction())
2381         switch (Call->getCalledFunction()->getIntrinsicID()) {
2382         default:
2383           break;
2384         case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the
          // current backend can't separate call targets from call arguments.
2387           return InlineResult::failure(
2388               "disallowed inlining of @llvm.icall.branch.funnel");
2389         case llvm::Intrinsic::localescape:
2390           // Disallow inlining functions that call @llvm.localescape. Doing this
2391           // correctly would require major changes to the inliner.
2392           return InlineResult::failure(
2393               "disallowed inlining of @llvm.localescape");
2394         case llvm::Intrinsic::vastart:
2395           // Disallow inlining of functions that initialize VarArgs with
2396           // va_start.
2397           return InlineResult::failure(
2398               "contains VarArgs initialized with va_start");
2399         }
2400     }
2401   }
2402 
2403   return InlineResult::success();
2404 }
2405 
2406 // APIs to create InlineParams based on command line flags and/or other
2407 // parameters.
2408 
2409 InlineParams llvm::getInlineParams(int Threshold) {
2410   InlineParams Params;
2411 
2412   // This field is the threshold to use for a callee by default. This is
2413   // derived from one or more of:
2414   //  * optimization or size-optimization levels,
2415   //  * a value passed to createFunctionInliningPass function, or
2416   //  * the -inline-threshold flag.
2417   //  If the -inline-threshold flag is explicitly specified, that is used
2418   //  irrespective of anything else.
2419   if (InlineThreshold.getNumOccurrences() > 0)
2420     Params.DefaultThreshold = InlineThreshold;
2421   else
2422     Params.DefaultThreshold = Threshold;
2423 
2424   // Set the HintThreshold knob from the -inlinehint-threshold.
2425   Params.HintThreshold = HintThreshold;
2426 
2427   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2428   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2429 
2430   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2431   // populate LocallyHotCallSiteThreshold. Later, we populate
2432   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2433   // we know that optimization level is O3 (in the getInlineParams variant that
2434   // takes the opt and size levels).
2435   // FIXME: Remove this check (and make the assignment unconditional) after
2436   // addressing size regression issues at O2.
2437   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2438     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2439 
2440   // Set the ColdCallSiteThreshold knob from the
2441   // -inline-cold-callsite-threshold.
2442   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2443 
  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
2452   if (InlineThreshold.getNumOccurrences() == 0) {
2453     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2454     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2455     Params.ColdThreshold = ColdThreshold;
2456   } else if (ColdThreshold.getNumOccurrences() > 0) {
2457     Params.ColdThreshold = ColdThreshold;
2458   }
2459   return Params;
2460 }
2461 
2462 InlineParams llvm::getInlineParams() {
2463   return getInlineParams(DefaultThreshold);
2464 }
2465 
2466 // Compute the default threshold for inlining based on the opt level and the
2467 // size opt level.
2468 static int computeThresholdFromOptLevels(unsigned OptLevel,
2469                                          unsigned SizeOptLevel) {
2470   if (OptLevel > 2)
2471     return InlineConstants::OptAggressiveThreshold;
2472   if (SizeOptLevel == 1) // -Os
2473     return InlineConstants::OptSizeThreshold;
2474   if (SizeOptLevel == 2) // -Oz
2475     return InlineConstants::OptMinSizeThreshold;
2476   return DefaultThreshold;
2477 }
2478 
2479 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2480   auto Params =
2481       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2482   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2483   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2484   // when it is specified explicitly.
2485   if (OptLevel > 2)
2486     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2487   return Params;
2488 }
2489