1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/IR/CallSite.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/GetElementPtrTypeIterator.h"
33 #include "llvm/IR/GlobalAlias.h"
34 #include "llvm/IR/InstVisitor.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Operator.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/raw_ostream.h"
39 
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "inline-cost"
43 
44 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
45 
46 static cl::opt<int> InlineThreshold(
47     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
48     cl::desc("Control the amount of inlining to perform (default = 225)"));
49 
50 static cl::opt<int> HintThreshold(
51     "inlinehint-threshold", cl::Hidden, cl::init(325),
52     cl::desc("Threshold for inlining functions with inline hint"));
53 
54 static cl::opt<int>
55     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
56                           cl::init(45),
57                           cl::desc("Threshold for inlining cold callsites"));
58 
59 // We introduce this threshold to help performance of instrumentation based
60 // PGO before we actually hook up inliner with analysis passes such as BPI and
61 // BFI.
62 static cl::opt<int> ColdThreshold(
63     "inlinecold-threshold", cl::Hidden, cl::init(45),
64     cl::desc("Threshold for inlining functions with cold attribute"));
65 
66 static cl::opt<int>
67     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
68                          cl::ZeroOrMore,
69                          cl::desc("Threshold for hot callsites "));
70 
71 static cl::opt<int> LocallyHotCallSiteThreshold(
72     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
73     cl::desc("Threshold for locally hot callsites "));
74 
75 static cl::opt<int> ColdCallSiteRelFreq(
76     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
77     cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
78              "entry frequency, for a callsite to be cold in the absence of "
79              "profile information."));
80 
81 static cl::opt<int> HotCallSiteRelFreq(
82     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
83     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
84              "entry frequency, for a callsite to be hot in the absence of "
85              "profile information."));
86 
87 static cl::opt<bool> OptComputeFullInlineCost(
88     "inline-cost-full", cl::Hidden, cl::init(false),
89     cl::desc("Compute the full inline cost of a call site even when the cost "
90              "exceeds the threshold."));
91 
92 namespace {
93 
94 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
95   typedef InstVisitor<CallAnalyzer, bool> Base;
96   friend class InstVisitor<CallAnalyzer, bool>;
97 
98   /// The TargetTransformInfo available for this compilation.
99   const TargetTransformInfo &TTI;
100 
101   /// Getter for the cache of @llvm.assume intrinsics.
102   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
103 
104   /// Getter for BlockFrequencyInfo
105   Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
106 
107   /// Profile summary information.
108   ProfileSummaryInfo *PSI;
109 
110   /// The called function.
111   Function &F;
112 
113   // Cache the DataLayout since we use it a lot.
114   const DataLayout &DL;
115 
116   /// The OptimizationRemarkEmitter available for this compilation.
117   OptimizationRemarkEmitter *ORE;
118 
119   /// The candidate callsite being analyzed. Please do not use this to do
120   /// analysis in the caller function; we want the inline cost query to be
121   /// easily cacheable. Instead, use the cover function paramHasAttr.
122   CallSite CandidateCS;
123 
124   /// Tunable parameters that control the analysis.
125   const InlineParams &Params;
126 
127   int Threshold;
128   int Cost;
129   bool ComputeFullInlineCost;
130 
131   bool IsCallerRecursive;
132   bool IsRecursiveCall;
133   bool ExposesReturnsTwice;
134   bool HasDynamicAlloca;
135   bool ContainsNoDuplicateCall;
136   bool HasReturn;
137   bool HasIndirectBr;
138   bool HasFrameEscape;
139 
140   /// Number of bytes allocated statically by the callee.
141   uint64_t AllocatedSize;
142   unsigned NumInstructions, NumVectorInstructions;
143   int VectorBonus, TenPercentVectorBonus;
144   // Bonus to be applied when the callee has only one reachable basic block.
145   int SingleBBBonus;
146 
147   /// While we walk the potentially-inlined instructions, we build up and
148   /// maintain a mapping of simplified values specific to this callsite. The
149   /// idea is to propagate any special information we have about arguments to
150   /// this call through the inlinable section of the function, and account for
151   /// likely simplifications post-inlining. The most important aspect we track
152   /// is CFG altering simplifications -- when we prove a basic block dead, that
153   /// can cause dramatic shifts in the cost of inlining a function.
154   DenseMap<Value *, Constant *> SimplifiedValues;
155 
156   /// Keep track of the values which map back (through function arguments) to
157   /// allocas on the caller stack which could be simplified through SROA.
158   DenseMap<Value *, Value *> SROAArgValues;
159 
160   /// The mapping of caller Alloca values to their accumulated cost savings. If
161   /// we have to disable SROA for one of the allocas, this tells us how much
162   /// cost must be added.
163   DenseMap<Value *, int> SROAArgCosts;
164 
165   /// Keep track of values which map to a pointer base and constant offset.
166   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
167 
168   /// Keep track of dead blocks due to the constant arguments.
169   SetVector<BasicBlock *> DeadBlocks;
170 
171   /// The mapping of the blocks to their known unique successors due to the
172   /// constant arguments.
173   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
174 
175   /// Model the elimination of repeated loads that is expected to happen
176   /// whenever we simplify away the stores that would otherwise cause them to be
177   /// loads.
178   bool EnableLoadElimination;
179   SmallPtrSet<Value *, 16> LoadAddrSet;
180   int LoadEliminationCost;
181 
182   // Custom simplification helper routines.
183   bool isAllocaDerivedArg(Value *V);
184   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
185                             DenseMap<Value *, int>::iterator &CostIt);
186   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
187   void disableSROA(Value *V);
188   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
189   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
190                           int InstructionCost);
191   void disableLoadElimination();
192   bool isGEPFree(GetElementPtrInst &GEP);
193   bool canFoldInboundsGEP(GetElementPtrInst &I);
194   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
195   bool simplifyCallSite(Function *F, CallSite CS);
196   template <typename Callable>
197   bool simplifyInstruction(Instruction &I, Callable Evaluate);
198   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
199 
200   /// Return true if the given argument to the function being considered for
201   /// inlining has the given attribute set either at the call site or the
202   /// function declaration.  Primarily used to inspect call site specific
203   /// attributes since these can be more precise than the ones on the callee
204   /// itself.
205   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
206 
207   /// Return true if the given value is known non null within the callee if
208   /// inlined through this particular callsite.
209   bool isKnownNonNullInCallee(Value *V);
210 
211   /// Update Threshold based on callsite properties such as callee
212   /// attributes and callee hotness for PGO builds. The Callee is explicitly
213   /// passed to support analyzing indirect calls whose target is inferred by
214   /// analysis.
215   void updateThreshold(CallSite CS, Function &Callee);
216 
217   /// Return true if size growth is allowed when inlining the callee at CS.
218   bool allowSizeGrowth(CallSite CS);
219 
220   /// Return true if \p CS is a cold callsite.
221   bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
222 
223   /// Return a higher threshold if \p CS is a hot callsite.
224   Optional<int> getHotCallSiteThreshold(CallSite CS,
225                                         BlockFrequencyInfo *CallerBFI);
226 
227   // Custom analysis routines.
228   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
229 
230   // Disable several entry points to the visitor so we don't accidentally use
231   // them by declaring but not defining them here.
232   void visit(Module *);
233   void visit(Module &);
234   void visit(Function *);
235   void visit(Function &);
236   void visit(BasicBlock *);
237   void visit(BasicBlock &);
238 
239   // Provide base case for our instruction visit.
240   bool visitInstruction(Instruction &I);
241 
242   // Our visit overrides.
243   bool visitAlloca(AllocaInst &I);
244   bool visitPHI(PHINode &I);
245   bool visitGetElementPtr(GetElementPtrInst &I);
246   bool visitBitCast(BitCastInst &I);
247   bool visitPtrToInt(PtrToIntInst &I);
248   bool visitIntToPtr(IntToPtrInst &I);
249   bool visitCastInst(CastInst &I);
250   bool visitUnaryInstruction(UnaryInstruction &I);
251   bool visitCmpInst(CmpInst &I);
252   bool visitSub(BinaryOperator &I);
253   bool visitBinaryOperator(BinaryOperator &I);
254   bool visitLoad(LoadInst &I);
255   bool visitStore(StoreInst &I);
256   bool visitExtractValue(ExtractValueInst &I);
257   bool visitInsertValue(InsertValueInst &I);
258   bool visitCallSite(CallSite CS);
259   bool visitReturnInst(ReturnInst &RI);
260   bool visitBranchInst(BranchInst &BI);
261   bool visitSelectInst(SelectInst &SI);
262   bool visitSwitchInst(SwitchInst &SI);
263   bool visitIndirectBrInst(IndirectBrInst &IBI);
264   bool visitResumeInst(ResumeInst &RI);
265   bool visitCleanupReturnInst(CleanupReturnInst &RI);
266   bool visitCatchReturnInst(CatchReturnInst &RI);
267   bool visitUnreachableInst(UnreachableInst &I);
268 
269 public:
270   CallAnalyzer(const TargetTransformInfo &TTI,
271                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
272                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
273                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
274                Function &Callee, CallSite CSArg, const InlineParams &Params)
275       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
276         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
277         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
278         Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
279                                        Params.ComputeFullInlineCost || ORE),
280         IsCallerRecursive(false), IsRecursiveCall(false),
281         ExposesReturnsTwice(false), HasDynamicAlloca(false),
282         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
283         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
284         NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
285         EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
286         NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
287         NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
288         SROACostSavings(0), SROACostSavingsLost(0) {}
289 
290   bool analyzeCall(CallSite CS);
291 
292   int getThreshold() { return Threshold; }
293   int getCost() { return Cost; }
294 
295   // Keep a bunch of stats about the cost savings found so we can print them
296   // out when debugging.
297   unsigned NumConstantArgs;
298   unsigned NumConstantOffsetPtrArgs;
299   unsigned NumAllocaArgs;
300   unsigned NumConstantPtrCmps;
301   unsigned NumConstantPtrDiffs;
302   unsigned NumInstructionsSimplified;
303   unsigned SROACostSavings;
304   unsigned SROACostSavingsLost;
305 
306   void dump();
307 };
308 
309 } // namespace
310 
311 /// \brief Test whether the given value is an Alloca-derived function argument.
312 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
313   return SROAArgValues.count(V);
314 }
315 
316 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
317 /// Returns false if V does not map to a SROA-candidate.
318 bool CallAnalyzer::lookupSROAArgAndCost(
319     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
320   if (SROAArgValues.empty() || SROAArgCosts.empty())
321     return false;
322 
323   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
324   if (ArgIt == SROAArgValues.end())
325     return false;
326 
327   Arg = ArgIt->second;
328   CostIt = SROAArgCosts.find(Arg);
329   return CostIt != SROAArgCosts.end();
330 }
331 
332 /// \brief Disable SROA for the candidate marked by this cost iterator.
333 ///
334 /// This marks the candidate as no longer viable for SROA, and adds the cost
335 /// savings associated with it back into the inline cost measurement.
336 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
337   // If we're no longer able to perform SROA we need to undo its cost savings
338   // and prevent subsequent analysis.
339   Cost += CostIt->second;
340   SROACostSavings -= CostIt->second;
341   SROACostSavingsLost += CostIt->second;
342   SROAArgCosts.erase(CostIt);
343   disableLoadElimination();
344 }
345 
346 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
347 void CallAnalyzer::disableSROA(Value *V) {
348   Value *SROAArg;
349   DenseMap<Value *, int>::iterator CostIt;
350   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
351     disableSROA(CostIt);
352 }
353 
354 /// \brief Accumulate the given cost for a particular SROA candidate.
355 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
356                                       int InstructionCost) {
357   CostIt->second += InstructionCost;
358   SROACostSavings += InstructionCost;
359 }
360 
361 void CallAnalyzer::disableLoadElimination() {
362   if (EnableLoadElimination) {
363     Cost += LoadEliminationCost;
364     LoadEliminationCost = 0;
365     EnableLoadElimination = false;
366   }
367 }
368 
369 /// \brief Accumulate a constant GEP offset into an APInt if possible.
370 ///
371 /// Returns false if unable to compute the offset for any reason. Respects any
372 /// simplified values known during the analysis of this callsite.
373 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
374   unsigned IntPtrWidth = DL.getPointerTypeSizeInBits(GEP.getType());
375   assert(IntPtrWidth == Offset.getBitWidth());
376 
377   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
378        GTI != GTE; ++GTI) {
379     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
380     if (!OpC)
381       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
382         OpC = dyn_cast<ConstantInt>(SimpleOp);
383     if (!OpC)
384       return false;
385     if (OpC->isZero())
386       continue;
387 
388     // Handle a struct index, which adds its field offset to the pointer.
389     if (StructType *STy = GTI.getStructTypeOrNull()) {
390       unsigned ElementIdx = OpC->getZExtValue();
391       const StructLayout *SL = DL.getStructLayout(STy);
392       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
393       continue;
394     }
395 
396     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
397     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
398   }
399   return true;
400 }
401 
402 /// \brief Use TTI to check whether a GEP is free.
403 ///
404 /// Respects any simplified values known during the analysis of this callsite.
405 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
406   SmallVector<Value *, 4> Operands;
407   Operands.push_back(GEP.getOperand(0));
408   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
409     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
410        Operands.push_back(SimpleOp);
411      else
412        Operands.push_back(*I);
413   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
414 }
415 
416 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
417   // Check whether inlining will turn a dynamic alloca into a static
418   // alloca and handle that case.
419   if (I.isArrayAllocation()) {
420     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
421     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
422       Type *Ty = I.getAllocatedType();
423       AllocatedSize = SaturatingMultiplyAdd(
424           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
425       return Base::visitAlloca(I);
426     }
427   }
428 
429   // Accumulate the allocated size.
430   if (I.isStaticAlloca()) {
431     Type *Ty = I.getAllocatedType();
432     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
433   }
434 
435   // We will happily inline static alloca instructions.
436   if (I.isStaticAlloca())
437     return Base::visitAlloca(I);
438 
439   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
440   // a variety of reasons, and so we would like to not inline them into
441   // functions which don't currently have a dynamic alloca. This simply
442   // disables inlining altogether in the presence of a dynamic alloca.
443   HasDynamicAlloca = true;
444   return false;
445 }
446 
447 bool CallAnalyzer::visitPHI(PHINode &I) {
448   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
449   // though we don't want to propagate it's bonuses. The idea is to disable
450   // SROA if it *might* be used in an inappropriate manner.
451 
452   // Phi nodes are always zero-cost.
453   // FIXME: Pointer sizes may differ between different address spaces, so do we
454   // need to use correct address space in the call to getPointerSizeInBits here?
455   // Or could we skip the getPointerSizeInBits call completely? As far as I can
456   // see the ZeroOffset is used as a dummy value, so we can probably use any
457   // bit width for the ZeroOffset?
458   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
459   bool CheckSROA = I.getType()->isPointerTy();
460 
461   // Track the constant or pointer with constant offset we've seen so far.
462   Constant *FirstC = nullptr;
463   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
464   Value *FirstV = nullptr;
465 
466   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
467     BasicBlock *Pred = I.getIncomingBlock(i);
468     // If the incoming block is dead, skip the incoming block.
469     if (DeadBlocks.count(Pred))
470       continue;
471     // If the parent block of phi is not the known successor of the incoming
472     // block, skip the incoming block.
473     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
474     if (KnownSuccessor && KnownSuccessor != I.getParent())
475       continue;
476 
477     Value *V = I.getIncomingValue(i);
478     // If the incoming value is this phi itself, skip the incoming value.
479     if (&I == V)
480       continue;
481 
482     Constant *C = dyn_cast<Constant>(V);
483     if (!C)
484       C = SimplifiedValues.lookup(V);
485 
486     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
487     if (!C && CheckSROA)
488       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
489 
490     if (!C && !BaseAndOffset.first)
491       // The incoming value is neither a constant nor a pointer with constant
492       // offset, exit early.
493       return true;
494 
495     if (FirstC) {
496       if (FirstC == C)
497         // If we've seen a constant incoming value before and it is the same
498         // constant we see this time, continue checking the next incoming value.
499         continue;
500       // Otherwise early exit because we either see a different constant or saw
501       // a constant before but we have a pointer with constant offset this time.
502       return true;
503     }
504 
505     if (FirstV) {
506       // The same logic as above, but check pointer with constant offset here.
507       if (FirstBaseAndOffset == BaseAndOffset)
508         continue;
509       return true;
510     }
511 
512     if (C) {
513       // This is the 1st time we've seen a constant, record it.
514       FirstC = C;
515       continue;
516     }
517 
518     // The remaining case is that this is the 1st time we've seen a pointer with
519     // constant offset, record it.
520     FirstV = V;
521     FirstBaseAndOffset = BaseAndOffset;
522   }
523 
524   // Check if we can map phi to a constant.
525   if (FirstC) {
526     SimplifiedValues[&I] = FirstC;
527     return true;
528   }
529 
530   // Check if we can map phi to a pointer with constant offset.
531   if (FirstBaseAndOffset.first) {
532     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
533 
534     Value *SROAArg;
535     DenseMap<Value *, int>::iterator CostIt;
536     if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
537       SROAArgValues[&I] = SROAArg;
538   }
539 
540   return true;
541 }
542 
543 /// \brief Check we can fold GEPs of constant-offset call site argument pointers.
544 /// This requires target data and inbounds GEPs.
545 ///
546 /// \return true if the specified GEP can be folded.
547 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
548   // Check if we have a base + offset for the pointer.
549   std::pair<Value *, APInt> BaseAndOffset =
550       ConstantOffsetPtrs.lookup(I.getPointerOperand());
551   if (!BaseAndOffset.first)
552     return false;
553 
554   // Check if the offset of this GEP is constant, and if so accumulate it
555   // into Offset.
556   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
557     return false;
558 
559   // Add the result as a new mapping to Base + Offset.
560   ConstantOffsetPtrs[&I] = BaseAndOffset;
561 
562   return true;
563 }
564 
565 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
566   Value *SROAArg;
567   DenseMap<Value *, int>::iterator CostIt;
568   bool SROACandidate =
569       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
570 
571   // Lambda to check whether a GEP's indices are all constant.
572   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
573     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
574       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
575         return false;
576     return true;
577   };
578 
579   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
580     if (SROACandidate)
581       SROAArgValues[&I] = SROAArg;
582 
583     // Constant GEPs are modeled as free.
584     return true;
585   }
586 
587   // Variable GEPs will require math and will disable SROA.
588   if (SROACandidate)
589     disableSROA(CostIt);
590   return isGEPFree(I);
591 }
592 
593 /// Simplify \p I if its operands are constants and update SimplifiedValues.
594 /// \p Evaluate is a callable specific to instruction type that evaluates the
595 /// instruction when all the operands are constants.
596 template <typename Callable>
597 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
598   SmallVector<Constant *, 2> COps;
599   for (Value *Op : I.operands()) {
600     Constant *COp = dyn_cast<Constant>(Op);
601     if (!COp)
602       COp = SimplifiedValues.lookup(Op);
603     if (!COp)
604       return false;
605     COps.push_back(COp);
606   }
607   auto *C = Evaluate(COps);
608   if (!C)
609     return false;
610   SimplifiedValues[&I] = C;
611   return true;
612 }
613 
614 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
615   // Propagate constants through bitcasts.
616   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
617         return ConstantExpr::getBitCast(COps[0], I.getType());
618       }))
619     return true;
620 
621   // Track base/offsets through casts
622   std::pair<Value *, APInt> BaseAndOffset =
623       ConstantOffsetPtrs.lookup(I.getOperand(0));
624   // Casts don't change the offset, just wrap it up.
625   if (BaseAndOffset.first)
626     ConstantOffsetPtrs[&I] = BaseAndOffset;
627 
628   // Also look for SROA candidates here.
629   Value *SROAArg;
630   DenseMap<Value *, int>::iterator CostIt;
631   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
632     SROAArgValues[&I] = SROAArg;
633 
634   // Bitcasts are always zero cost.
635   return true;
636 }
637 
638 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
639   // Propagate constants through ptrtoint.
640   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
641         return ConstantExpr::getPtrToInt(COps[0], I.getType());
642       }))
643     return true;
644 
645   // Track base/offset pairs when converted to a plain integer provided the
646   // integer is large enough to represent the pointer.
647   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
648   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
649   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
650     std::pair<Value *, APInt> BaseAndOffset =
651         ConstantOffsetPtrs.lookup(I.getOperand(0));
652     if (BaseAndOffset.first)
653       ConstantOffsetPtrs[&I] = BaseAndOffset;
654   }
655 
656   // This is really weird. Technically, ptrtoint will disable SROA. However,
657   // unless that ptrtoint is *used* somewhere in the live basic blocks after
658   // inlining, it will be nuked, and SROA should proceed. All of the uses which
659   // would block SROA would also block SROA if applied directly to a pointer,
660   // and so we can just add the integer in here. The only places where SROA is
661   // preserved either cannot fire on an integer, or won't in-and-of themselves
662   // disable SROA (ext) w/o some later use that we would see and disable.
663   Value *SROAArg;
664   DenseMap<Value *, int>::iterator CostIt;
665   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
666     SROAArgValues[&I] = SROAArg;
667 
668   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
669 }
670 
671 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
672   // Propagate constants through ptrtoint.
673   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
674         return ConstantExpr::getIntToPtr(COps[0], I.getType());
675       }))
676     return true;
677 
678   // Track base/offset pairs when round-tripped through a pointer without
679   // modifications provided the integer is not too large.
680   Value *Op = I.getOperand(0);
681   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
682   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
683     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
684     if (BaseAndOffset.first)
685       ConstantOffsetPtrs[&I] = BaseAndOffset;
686   }
687 
688   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
689   Value *SROAArg;
690   DenseMap<Value *, int>::iterator CostIt;
691   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
692     SROAArgValues[&I] = SROAArg;
693 
694   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
695 }
696 
697 bool CallAnalyzer::visitCastInst(CastInst &I) {
698   // Propagate constants through ptrtoint.
699   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
700         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
701       }))
702     return true;
703 
704   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
705   disableSROA(I.getOperand(0));
706 
707   // If this is a floating-point cast, and the target says this operation
708   // is expensive, this may eventually become a library call. Treat the cost
709   // as such.
710   switch (I.getOpcode()) {
711   case Instruction::FPTrunc:
712   case Instruction::FPExt:
713   case Instruction::UIToFP:
714   case Instruction::SIToFP:
715   case Instruction::FPToUI:
716   case Instruction::FPToSI:
717     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
718       Cost += InlineConstants::CallPenalty;
719   default:
720     break;
721   }
722 
723   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
724 }
725 
726 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
727   Value *Operand = I.getOperand(0);
728   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
729         return ConstantFoldInstOperands(&I, COps[0], DL);
730       }))
731     return true;
732 
733   // Disable any SROA on the argument to arbitrary unary operators.
734   disableSROA(Operand);
735 
736   return false;
737 }
738 
739 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
740   return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
741 }
742 
743 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
744   // Does the *call site* have the NonNull attribute set on an argument?  We
745   // use the attribute on the call site to memoize any analysis done in the
746   // caller. This will also trip if the callee function has a non-null
747   // parameter attribute, but that's a less interesting case because hopefully
748   // the callee would already have been simplified based on that.
749   if (Argument *A = dyn_cast<Argument>(V))
750     if (paramHasAttr(A, Attribute::NonNull))
751       return true;
752 
753   // Is this an alloca in the caller?  This is distinct from the attribute case
754   // above because attributes aren't updated within the inliner itself and we
755   // always want to catch the alloca derived case.
756   if (isAllocaDerivedArg(V))
757     // We can actually predict the result of comparisons between an
758     // alloca-derived value and null. Note that this fires regardless of
759     // SROA firing.
760     return true;
761 
762   return false;
763 }
764 
765 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
766   // If the normal destination of the invoke or the parent block of the call
767   // site is unreachable-terminated, there is little point in inlining this
768   // unless there is literally zero cost.
769   // FIXME: Note that it is possible that an unreachable-terminated block has a
770   // hot entry. For example, in below scenario inlining hot_call_X() may be
771   // beneficial :
772   // main() {
773   //   hot_call_1();
774   //   ...
775   //   hot_call_N()
776   //   exit(0);
777   // }
778   // For now, we are not handling this corner case here as it is rare in real
779   // code. In future, we should elaborate this based on BPI and BFI in more
780   // general threshold adjusting heuristics in updateThreshold().
781   Instruction *Instr = CS.getInstruction();
782   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
783     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
784       return false;
785   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
786     return false;
787 
788   return true;
789 }
790 
791 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
792   // If global profile summary is available, then callsite's coldness is
793   // determined based on that.
794   if (PSI && PSI->hasProfileSummary())
795     return PSI->isColdCallSite(CS, CallerBFI);
796 
797   // Otherwise we need BFI to be available.
798   if (!CallerBFI)
799     return false;
800 
801   // Determine if the callsite is cold relative to caller's entry. We could
802   // potentially cache the computation of scaled entry frequency, but the added
803   // complexity is not worth it unless this scaling shows up high in the
804   // profiles.
805   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
806   auto CallSiteBB = CS.getInstruction()->getParent();
807   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
808   auto CallerEntryFreq =
809       CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
810   return CallSiteFreq < CallerEntryFreq * ColdProb;
811 }
812 
813 Optional<int>
814 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
815                                       BlockFrequencyInfo *CallerBFI) {
816 
817   // If global profile summary is available, then callsite's hotness is
818   // determined based on that.
819   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
820     return Params.HotCallSiteThreshold;
821 
822   // Otherwise we need BFI to be available and to have a locally hot callsite
823   // threshold.
824   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
825     return None;
826 
827   // Determine if the callsite is hot relative to caller's entry. We could
828   // potentially cache the computation of scaled entry frequency, but the added
829   // complexity is not worth it unless this scaling shows up high in the
830   // profiles.
831   auto CallSiteBB = CS.getInstruction()->getParent();
832   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
833   auto CallerEntryFreq = CallerBFI->getEntryFreq();
834   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
835     return Params.LocallyHotCallSiteThreshold;
836 
837   // Otherwise treat it normally.
838   return None;
839 }
840 
841 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
842   // If no size growth is allowed for this inlining, set Threshold to 0.
843   if (!allowSizeGrowth(CS)) {
844     Threshold = 0;
845     return;
846   }
847 
848   Function *Caller = CS.getCaller();
849 
850   // return min(A, B) if B is valid.
851   auto MinIfValid = [](int A, Optional<int> B) {
852     return B ? std::min(A, B.getValue()) : A;
853   };
854 
855   // return max(A, B) if B is valid.
856   auto MaxIfValid = [](int A, Optional<int> B) {
857     return B ? std::max(A, B.getValue()) : A;
858   };
859 
860   // Various bonus percentages. These are multiplied by Threshold to get the
861   // bonus values.
862   // SingleBBBonus: This bonus is applied if the callee has a single reachable
863   // basic block at the given callsite context. This is speculatively applied
864   // and withdrawn if more than one basic block is seen.
865   //
866   // Vector bonuses: We want to more aggressively inline vector-dense kernels
867   // and apply this bonus based on the percentage of vector instructions. A
868   // bonus is applied if the vector instructions exceed 50% and half that amount
869   // is applied if it exceeds 10%. Note that these bonuses are some what
870   // arbitrary and evolved over time by accident as much as because they are
871   // principled bonuses.
872   // FIXME: It would be nice to base the bonus values on something more
873   // scientific.
874   //
875   // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
876   // of the last call to a static function as inlining such functions is
877   // guaranteed to reduce code size.
878   //
879   // These bonus percentages may be set to 0 based on properties of the caller
880   // and the callsite.
881   int SingleBBBonusPercent = 50;
882   int VectorBonusPercent = 150;
883   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
884 
885   // Lambda to set all the above bonus and bonus percentages to 0.
886   auto DisallowAllBonuses = [&]() {
887     SingleBBBonusPercent = 0;
888     VectorBonusPercent = 0;
889     LastCallToStaticBonus = 0;
890   };
891 
892   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
893   // and reduce the threshold if the caller has the necessary attribute.
894   if (Caller->optForMinSize()) {
895     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
896     // For minsize, we want to disable the single BB bonus and the vector
897     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
898     // a static function will, at the minimum, eliminate the parameter setup and
899     // call/return instructions.
900     SingleBBBonusPercent = 0;
901     VectorBonusPercent = 0;
902   } else if (Caller->optForSize())
903     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
904 
905   // Adjust the threshold based on inlinehint attribute and profile based
906   // hotness information if the caller does not have MinSize attribute.
907   if (!Caller->optForMinSize()) {
908     if (Callee.hasFnAttribute(Attribute::InlineHint))
909       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
910 
911     // FIXME: After switching to the new passmanager, simplify the logic below
912     // by checking only the callsite hotness/coldness as we will reliably
913     // have local profile information.
914     //
915     // Callsite hotness and coldness can be determined if sample profile is
916     // used (which adds hotness metadata to calls) or if caller's
917     // BlockFrequencyInfo is available.
918     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
919     auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
920     if (!Caller->optForSize() && HotCallSiteThreshold) {
921       DEBUG(dbgs() << "Hot callsite.\n");
922       // FIXME: This should update the threshold only if it exceeds the
923       // current threshold, but AutoFDO + ThinLTO currently relies on this
924       // behavior to prevent inlining of hot callsites during ThinLTO
925       // compile phase.
926       Threshold = HotCallSiteThreshold.getValue();
927     } else if (isColdCallSite(CS, CallerBFI)) {
928       DEBUG(dbgs() << "Cold callsite.\n");
929       // Do not apply bonuses for a cold callsite including the
930       // LastCallToStatic bonus. While this bonus might result in code size
931       // reduction, it can cause the size of a non-cold caller to increase
932       // preventing it from being inlined.
933       DisallowAllBonuses();
934       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
935     } else if (PSI) {
936       // Use callee's global profile information only if we have no way of
937       // determining this via callsite information.
938       if (PSI->isFunctionEntryHot(&Callee)) {
939         DEBUG(dbgs() << "Hot callee.\n");
940         // If callsite hotness can not be determined, we may still know
941         // that the callee is hot and treat it as a weaker hint for threshold
942         // increase.
943         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
944       } else if (PSI->isFunctionEntryCold(&Callee)) {
945         DEBUG(dbgs() << "Cold callee.\n");
946         // Do not apply bonuses for a cold callee including the
947         // LastCallToStatic bonus. While this bonus might result in code size
948         // reduction, it can cause the size of a non-cold caller to increase
949         // preventing it from being inlined.
950         DisallowAllBonuses();
951         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
952       }
953     }
954   }
955 
956   // Finally, take the target-specific inlining threshold multiplier into
957   // account.
958   Threshold *= TTI.getInliningThresholdMultiplier();
959 
960   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
961   VectorBonus = Threshold * VectorBonusPercent / 100;
962 
963   bool OnlyOneCallAndLocalLinkage =
964       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
965   // If there is only one call of the function, and it has internal linkage,
966   // the cost of inlining it drops dramatically. It may seem odd to update
967   // Cost in updateThreshold, but the bonus depends on the logic in this method.
968   if (OnlyOneCallAndLocalLinkage)
969     Cost -= LastCallToStaticBonus;
970 }
971 
972 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
973   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
974   // First try to handle simplified comparisons.
975   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
976         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
977       }))
978     return true;
979 
980   if (I.getOpcode() == Instruction::FCmp)
981     return false;
982 
983   // Otherwise look for a comparison between constant offset pointers with
984   // a common base.
985   Value *LHSBase, *RHSBase;
986   APInt LHSOffset, RHSOffset;
987   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
988   if (LHSBase) {
989     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
990     if (RHSBase && LHSBase == RHSBase) {
991       // We have common bases, fold the icmp to a constant based on the
992       // offsets.
993       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
994       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
995       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
996         SimplifiedValues[&I] = C;
997         ++NumConstantPtrCmps;
998         return true;
999       }
1000     }
1001   }
1002 
1003   // If the comparison is an equality comparison with null, we can simplify it
1004   // if we know the value (argument) can't be null
1005   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1006       isKnownNonNullInCallee(I.getOperand(0))) {
1007     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1008     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1009                                       : ConstantInt::getFalse(I.getType());
1010     return true;
1011   }
1012   // Finally check for SROA candidates in comparisons.
1013   Value *SROAArg;
1014   DenseMap<Value *, int>::iterator CostIt;
1015   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1016     if (isa<ConstantPointerNull>(I.getOperand(1))) {
1017       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1018       return true;
1019     }
1020 
1021     disableSROA(CostIt);
1022   }
1023 
1024   return false;
1025 }
1026 
1027 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1028   // Try to handle a special case: we can fold computing the difference of two
1029   // constant-related pointers.
1030   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1031   Value *LHSBase, *RHSBase;
1032   APInt LHSOffset, RHSOffset;
1033   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1034   if (LHSBase) {
1035     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1036     if (RHSBase && LHSBase == RHSBase) {
1037       // We have common bases, fold the subtract to a constant based on the
1038       // offsets.
1039       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1040       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1041       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1042         SimplifiedValues[&I] = C;
1043         ++NumConstantPtrDiffs;
1044         return true;
1045       }
1046     }
1047   }
1048 
1049   // Otherwise, fall back to the generic logic for simplifying and handling
1050   // instructions.
1051   return Base::visitSub(I);
1052 }
1053 
1054 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1055   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1056   Constant *CLHS = dyn_cast<Constant>(LHS);
1057   if (!CLHS)
1058     CLHS = SimplifiedValues.lookup(LHS);
1059   Constant *CRHS = dyn_cast<Constant>(RHS);
1060   if (!CRHS)
1061     CRHS = SimplifiedValues.lookup(RHS);
1062 
1063   Value *SimpleV = nullptr;
1064   if (auto FI = dyn_cast<FPMathOperator>(&I))
1065     SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1066                               CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1067   else
1068     SimpleV =
1069         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1070 
1071   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1072     SimplifiedValues[&I] = C;
1073 
1074   if (SimpleV)
1075     return true;
1076 
1077   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1078   disableSROA(LHS);
1079   disableSROA(RHS);
1080 
1081   // If the instruction is floating point, and the target says this operation
1082   // is expensive, this may eventually become a library call. Treat the cost
1083   // as such.
1084   if (I.getType()->isFloatingPointTy() &&
1085       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1086     Cost += InlineConstants::CallPenalty;
1087 
1088   return false;
1089 }
1090 
1091 bool CallAnalyzer::visitLoad(LoadInst &I) {
1092   Value *SROAArg;
1093   DenseMap<Value *, int>::iterator CostIt;
1094   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1095     if (I.isSimple()) {
1096       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1097       return true;
1098     }
1099 
1100     disableSROA(CostIt);
1101   }
1102 
1103   // If the data is already loaded from this address and hasn't been clobbered
1104   // by any stores or calls, this load is likely to be redundant and can be
1105   // eliminated.
1106   if (EnableLoadElimination &&
1107       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1108     LoadEliminationCost += InlineConstants::InstrCost;
1109     return true;
1110   }
1111 
1112   return false;
1113 }
1114 
1115 bool CallAnalyzer::visitStore(StoreInst &I) {
1116   Value *SROAArg;
1117   DenseMap<Value *, int>::iterator CostIt;
1118   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1119     if (I.isSimple()) {
1120       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1121       return true;
1122     }
1123 
1124     disableSROA(CostIt);
1125   }
1126 
1127   // The store can potentially clobber loads and prevent repeated loads from
1128   // being eliminated.
1129   // FIXME:
1130   // 1. We can probably keep an initial set of eliminatable loads substracted
1131   // from the cost even when we finally see a store. We just need to disable
1132   // *further* accumulation of elimination savings.
1133   // 2. We should probably at some point thread MemorySSA for the callee into
1134   // this and then use that to actually compute *really* precise savings.
1135   disableLoadElimination();
1136   return false;
1137 }
1138 
1139 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1140   // Constant folding for extract value is trivial.
1141   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1142         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1143       }))
1144     return true;
1145 
1146   // SROA can look through these but give them a cost.
1147   return false;
1148 }
1149 
1150 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1151   // Constant folding for insert value is trivial.
1152   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1153         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1154                                             /*InsertedValueOperand*/ COps[1],
1155                                             I.getIndices());
1156       }))
1157     return true;
1158 
1159   // SROA can look through these but give them a cost.
1160   return false;
1161 }
1162 
1163 /// \brief Try to simplify a call site.
1164 ///
1165 /// Takes a concrete function and callsite and tries to actually simplify it by
1166 /// analyzing the arguments and call itself with instsimplify. Returns true if
1167 /// it has simplified the callsite to some other entity (a constant), making it
1168 /// free.
1169 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1170   // FIXME: Using the instsimplify logic directly for this is inefficient
1171   // because we have to continually rebuild the argument list even when no
1172   // simplifications can be performed. Until that is fixed with remapping
1173   // inside of instsimplify, directly constant fold calls here.
1174   if (!canConstantFoldCallTo(CS, F))
1175     return false;
1176 
1177   // Try to re-map the arguments to constants.
1178   SmallVector<Constant *, 4> ConstantArgs;
1179   ConstantArgs.reserve(CS.arg_size());
1180   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1181        ++I) {
1182     Constant *C = dyn_cast<Constant>(*I);
1183     if (!C)
1184       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1185     if (!C)
1186       return false; // This argument doesn't map to a constant.
1187 
1188     ConstantArgs.push_back(C);
1189   }
1190   if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1191     SimplifiedValues[CS.getInstruction()] = C;
1192     return true;
1193   }
1194 
1195   return false;
1196 }
1197 
1198 bool CallAnalyzer::visitCallSite(CallSite CS) {
1199   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1200       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1201     // This aborts the entire analysis.
1202     ExposesReturnsTwice = true;
1203     return false;
1204   }
1205   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1206     ContainsNoDuplicateCall = true;
1207 
1208   if (Function *F = CS.getCalledFunction()) {
1209     // When we have a concrete function, first try to simplify it directly.
1210     if (simplifyCallSite(F, CS))
1211       return true;
1212 
1213     // Next check if it is an intrinsic we know about.
1214     // FIXME: Lift this into part of the InstVisitor.
1215     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1216       switch (II->getIntrinsicID()) {
1217       default:
1218         if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1219           disableLoadElimination();
1220         return Base::visitCallSite(CS);
1221 
1222       case Intrinsic::load_relative:
1223         // This is normally lowered to 4 LLVM instructions.
1224         Cost += 3 * InlineConstants::InstrCost;
1225         return false;
1226 
1227       case Intrinsic::memset:
1228       case Intrinsic::memcpy:
1229       case Intrinsic::memmove:
1230         disableLoadElimination();
1231         // SROA can usually chew through these intrinsics, but they aren't free.
1232         return false;
1233       case Intrinsic::localescape:
1234         HasFrameEscape = true;
1235         return false;
1236       }
1237     }
1238 
1239     if (F == CS.getInstruction()->getFunction()) {
1240       // This flag will fully abort the analysis, so don't bother with anything
1241       // else.
1242       IsRecursiveCall = true;
1243       return false;
1244     }
1245 
1246     if (TTI.isLoweredToCall(F)) {
1247       // We account for the average 1 instruction per call argument setup
1248       // here.
1249       Cost += CS.arg_size() * InlineConstants::InstrCost;
1250 
1251       // Everything other than inline ASM will also have a significant cost
1252       // merely from making the call.
1253       if (!isa<InlineAsm>(CS.getCalledValue()))
1254         Cost += InlineConstants::CallPenalty;
1255     }
1256 
1257     if (!CS.onlyReadsMemory())
1258       disableLoadElimination();
1259     return Base::visitCallSite(CS);
1260   }
1261 
1262   // Otherwise we're in a very special case -- an indirect function call. See
1263   // if we can be particularly clever about this.
1264   Value *Callee = CS.getCalledValue();
1265 
1266   // First, pay the price of the argument setup. We account for the average
1267   // 1 instruction per call argument setup here.
1268   Cost += CS.arg_size() * InlineConstants::InstrCost;
1269 
1270   // Next, check if this happens to be an indirect function call to a known
1271   // function in this inline context. If not, we've done all we can.
1272   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1273   if (!F) {
1274     if (!CS.onlyReadsMemory())
1275       disableLoadElimination();
1276     return Base::visitCallSite(CS);
1277   }
1278 
1279   // If we have a constant that we are calling as a function, we can peer
1280   // through it and see the function target. This happens not infrequently
1281   // during devirtualization and so we want to give it a hefty bonus for
1282   // inlining, but cap that bonus in the event that inlining wouldn't pan
1283   // out. Pretend to inline the function, with a custom threshold.
1284   auto IndirectCallParams = Params;
1285   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1286   CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1287                   IndirectCallParams);
1288   if (CA.analyzeCall(CS)) {
1289     // We were able to inline the indirect call! Subtract the cost from the
1290     // threshold to get the bonus we want to apply, but don't go below zero.
1291     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1292   }
1293 
1294   if (!F->onlyReadsMemory())
1295     disableLoadElimination();
1296   return Base::visitCallSite(CS);
1297 }
1298 
1299 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1300   // At least one return instruction will be free after inlining.
1301   bool Free = !HasReturn;
1302   HasReturn = true;
1303   return Free;
1304 }
1305 
1306 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1307   // We model unconditional branches as essentially free -- they really
1308   // shouldn't exist at all, but handling them makes the behavior of the
1309   // inliner more regular and predictable. Interestingly, conditional branches
1310   // which will fold away are also free.
1311   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1312          dyn_cast_or_null<ConstantInt>(
1313              SimplifiedValues.lookup(BI.getCondition()));
1314 }
1315 
1316 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1317   bool CheckSROA = SI.getType()->isPointerTy();
1318   Value *TrueVal = SI.getTrueValue();
1319   Value *FalseVal = SI.getFalseValue();
1320 
1321   Constant *TrueC = dyn_cast<Constant>(TrueVal);
1322   if (!TrueC)
1323     TrueC = SimplifiedValues.lookup(TrueVal);
1324   Constant *FalseC = dyn_cast<Constant>(FalseVal);
1325   if (!FalseC)
1326     FalseC = SimplifiedValues.lookup(FalseVal);
1327   Constant *CondC =
1328       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1329 
1330   if (!CondC) {
1331     // Select C, X, X => X
1332     if (TrueC == FalseC && TrueC) {
1333       SimplifiedValues[&SI] = TrueC;
1334       return true;
1335     }
1336 
1337     if (!CheckSROA)
1338       return Base::visitSelectInst(SI);
1339 
1340     std::pair<Value *, APInt> TrueBaseAndOffset =
1341         ConstantOffsetPtrs.lookup(TrueVal);
1342     std::pair<Value *, APInt> FalseBaseAndOffset =
1343         ConstantOffsetPtrs.lookup(FalseVal);
1344     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1345       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1346 
1347       Value *SROAArg;
1348       DenseMap<Value *, int>::iterator CostIt;
1349       if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1350         SROAArgValues[&SI] = SROAArg;
1351       return true;
1352     }
1353 
1354     return Base::visitSelectInst(SI);
1355   }
1356 
1357   // Select condition is a constant.
1358   Value *SelectedV = CondC->isAllOnesValue()
1359                          ? TrueVal
1360                          : (CondC->isNullValue()) ? FalseVal : nullptr;
1361   if (!SelectedV) {
1362     // Condition is a vector constant that is not all 1s or all 0s.  If all
1363     // operands are constants, ConstantExpr::getSelect() can handle the cases
1364     // such as select vectors.
1365     if (TrueC && FalseC) {
1366       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1367         SimplifiedValues[&SI] = C;
1368         return true;
1369       }
1370     }
1371     return Base::visitSelectInst(SI);
1372   }
1373 
1374   // Condition is either all 1s or all 0s. SI can be simplified.
1375   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1376     SimplifiedValues[&SI] = SelectedC;
1377     return true;
1378   }
1379 
1380   if (!CheckSROA)
1381     return true;
1382 
1383   std::pair<Value *, APInt> BaseAndOffset =
1384       ConstantOffsetPtrs.lookup(SelectedV);
1385   if (BaseAndOffset.first) {
1386     ConstantOffsetPtrs[&SI] = BaseAndOffset;
1387 
1388     Value *SROAArg;
1389     DenseMap<Value *, int>::iterator CostIt;
1390     if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1391       SROAArgValues[&SI] = SROAArg;
1392   }
1393 
1394   return true;
1395 }
1396 
1397 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1398   // We model unconditional switches as free, see the comments on handling
1399   // branches.
1400   if (isa<ConstantInt>(SI.getCondition()))
1401     return true;
1402   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1403     if (isa<ConstantInt>(V))
1404       return true;
1405 
1406   // Assume the most general case where the switch is lowered into
1407   // either a jump table, bit test, or a balanced binary tree consisting of
1408   // case clusters without merging adjacent clusters with the same
1409   // destination. We do not consider the switches that are lowered with a mix
1410   // of jump table/bit test/binary search tree. The cost of the switch is
1411   // proportional to the size of the tree or the size of jump table range.
1412   //
1413   // NB: We convert large switches which are just used to initialize large phi
1414   // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1415   // inlining those. It will prevent inlining in cases where the optimization
1416   // does not (yet) fire.
1417 
1418   // Maximum valid cost increased in this function.
1419   int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1420 
1421   // Exit early for a large switch, assuming one case needs at least one
1422   // instruction.
1423   // FIXME: This is not true for a bit test, but ignore such case for now to
1424   // save compile-time.
1425   int64_t CostLowerBound =
1426       std::min((int64_t)CostUpperBound,
1427                (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1428 
1429   if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1430     Cost = CostLowerBound;
1431     return false;
1432   }
1433 
1434   unsigned JumpTableSize = 0;
1435   unsigned NumCaseCluster =
1436       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1437 
1438   // If suitable for a jump table, consider the cost for the table size and
1439   // branch to destination.
1440   if (JumpTableSize) {
1441     int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1442                      4 * InlineConstants::InstrCost;
1443 
1444     Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1445     return false;
1446   }
1447 
1448   // Considering forming a binary search, we should find the number of nodes
1449   // which is same as the number of comparisons when lowered. For a given
1450   // number of clusters, n, we can define a recursive function, f(n), to find
1451   // the number of nodes in the tree. The recursion is :
1452   // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1453   // and f(n) = n, when n <= 3.
1454   // This will lead a binary tree where the leaf should be either f(2) or f(3)
1455   // when n > 3.  So, the number of comparisons from leaves should be n, while
1456   // the number of non-leaf should be :
1457   //   2^(log2(n) - 1) - 1
1458   //   = 2^log2(n) * 2^-1 - 1
1459   //   = n / 2 - 1.
1460   // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1461   // number of comparisons in a simple closed form :
1462   //   n + n / 2 - 1 = n * 3 / 2 - 1
1463   if (NumCaseCluster <= 3) {
1464     // Suppose a comparison includes one compare and one conditional branch.
1465     Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1466     return false;
1467   }
1468 
1469   int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1470   int64_t SwitchCost =
1471       ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1472 
1473   Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1474   return false;
1475 }
1476 
1477 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1478   // We never want to inline functions that contain an indirectbr.  This is
1479   // incorrect because all the blockaddress's (in static global initializers
1480   // for example) would be referring to the original function, and this
1481   // indirect jump would jump from the inlined copy of the function into the
1482   // original function which is extremely undefined behavior.
1483   // FIXME: This logic isn't really right; we can safely inline functions with
1484   // indirectbr's as long as no other function or global references the
1485   // blockaddress of a block within the current function.
1486   HasIndirectBr = true;
1487   return false;
1488 }
1489 
1490 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1491   // FIXME: It's not clear that a single instruction is an accurate model for
1492   // the inline cost of a resume instruction.
1493   return false;
1494 }
1495 
1496 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1497   // FIXME: It's not clear that a single instruction is an accurate model for
1498   // the inline cost of a cleanupret instruction.
1499   return false;
1500 }
1501 
1502 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1503   // FIXME: It's not clear that a single instruction is an accurate model for
1504   // the inline cost of a catchret instruction.
1505   return false;
1506 }
1507 
1508 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1509   // FIXME: It might be reasonably to discount the cost of instructions leading
1510   // to unreachable as they have the lowest possible impact on both runtime and
1511   // code size.
1512   return true; // No actual code is needed for unreachable.
1513 }
1514 
1515 bool CallAnalyzer::visitInstruction(Instruction &I) {
1516   // Some instructions are free. All of the free intrinsics can also be
1517   // handled by SROA, etc.
1518   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1519     return true;
1520 
1521   // We found something we don't understand or can't handle. Mark any SROA-able
1522   // values in the operand list as no longer viable.
1523   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1524     disableSROA(*OI);
1525 
1526   return false;
1527 }
1528 
1529 /// \brief Analyze a basic block for its contribution to the inline cost.
1530 ///
1531 /// This method walks the analyzer over every instruction in the given basic
1532 /// block and accounts for their cost during inlining at this callsite. It
1533 /// aborts early if the threshold has been exceeded or an impossible to inline
1534 /// construct has been detected. It returns false if inlining is no longer
1535 /// viable, and true if inlining remains viable.
1536 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1537                                 SmallPtrSetImpl<const Value *> &EphValues) {
1538   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1539     // FIXME: Currently, the number of instructions in a function regardless of
1540     // our ability to simplify them during inline to constants or dead code,
1541     // are actually used by the vector bonus heuristic. As long as that's true,
1542     // we have to special case debug intrinsics here to prevent differences in
1543     // inlining due to debug symbols. Eventually, the number of unsimplified
1544     // instructions shouldn't factor into the cost computation, but until then,
1545     // hack around it here.
1546     if (isa<DbgInfoIntrinsic>(I))
1547       continue;
1548 
1549     // Skip ephemeral values.
1550     if (EphValues.count(&*I))
1551       continue;
1552 
1553     ++NumInstructions;
1554     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1555       ++NumVectorInstructions;
1556 
1557     // If the instruction simplified to a constant, there is no cost to this
1558     // instruction. Visit the instructions using our InstVisitor to account for
1559     // all of the per-instruction logic. The visit tree returns true if we
1560     // consumed the instruction in any way, and false if the instruction's base
1561     // cost should count against inlining.
1562     if (Base::visit(&*I))
1563       ++NumInstructionsSimplified;
1564     else
1565       Cost += InlineConstants::InstrCost;
1566 
1567     using namespace ore;
1568     // If the visit this instruction detected an uninlinable pattern, abort.
1569     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1570         HasIndirectBr || HasFrameEscape) {
1571       if (ORE)
1572         ORE->emit([&]() {
1573           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1574                                           CandidateCS.getInstruction())
1575                  << NV("Callee", &F)
1576                  << " has uninlinable pattern and cost is not fully computed";
1577         });
1578       return false;
1579     }
1580 
1581     // If the caller is a recursive function then we don't want to inline
1582     // functions which allocate a lot of stack space because it would increase
1583     // the caller stack usage dramatically.
1584     if (IsCallerRecursive &&
1585         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1586       if (ORE)
1587         ORE->emit([&]() {
1588           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1589                                           CandidateCS.getInstruction())
1590                  << NV("Callee", &F)
1591                  << " is recursive and allocates too much stack space. Cost is "
1592                     "not fully computed";
1593         });
1594       return false;
1595     }
1596 
1597     // Check if we've past the maximum possible threshold so we don't spin in
1598     // huge basic blocks that will never inline.
1599     if (Cost >= Threshold && !ComputeFullInlineCost)
1600       return false;
1601   }
1602 
1603   return true;
1604 }
1605 
1606 /// \brief Compute the base pointer and cumulative constant offsets for V.
1607 ///
1608 /// This strips all constant offsets off of V, leaving it the base pointer, and
1609 /// accumulates the total constant offset applied in the returned constant. It
1610 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1611 /// no constant offsets applied.
1612 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1613   if (!V->getType()->isPointerTy())
1614     return nullptr;
1615 
1616   unsigned AS = V->getType()->getPointerAddressSpace();
1617   unsigned IntPtrWidth = DL.getPointerSizeInBits(AS);
1618   APInt Offset = APInt::getNullValue(IntPtrWidth);
1619 
1620   // Even though we don't look through PHI nodes, we could be called on an
1621   // instruction in an unreachable block, which may be on a cycle.
1622   SmallPtrSet<Value *, 4> Visited;
1623   Visited.insert(V);
1624   do {
1625     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1626       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1627         return nullptr;
1628       V = GEP->getPointerOperand();
1629     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1630       V = cast<Operator>(V)->getOperand(0);
1631     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1632       if (GA->isInterposable())
1633         break;
1634       V = GA->getAliasee();
1635     } else {
1636       break;
1637     }
1638     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1639   } while (Visited.insert(V).second);
1640 
1641   Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1642   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1643 }
1644 
1645 /// \brief Find dead blocks due to deleted CFG edges during inlining.
1646 ///
1647 /// If we know the successor of the current block, \p CurrBB, has to be \p
1648 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1649 /// no live incoming CFG edges.  If one block is found to be dead, we can
1650 /// continue growing the dead block list by checking the successors of the dead
1651 /// blocks to see if all their incoming edges are dead or not.
1652 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1653   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1654     // A CFG edge is dead if the predecessor is dead or the predessor has a
1655     // known successor which is not the one under exam.
1656     return (DeadBlocks.count(Pred) ||
1657             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1658   };
1659 
1660   auto IsNewlyDead = [&](BasicBlock *BB) {
1661     // If all the edges to a block are dead, the block is also dead.
1662     return (!DeadBlocks.count(BB) &&
1663             llvm::all_of(predecessors(BB),
1664                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1665   };
1666 
1667   for (BasicBlock *Succ : successors(CurrBB)) {
1668     if (Succ == NextBB || !IsNewlyDead(Succ))
1669       continue;
1670     SmallVector<BasicBlock *, 4> NewDead;
1671     NewDead.push_back(Succ);
1672     while (!NewDead.empty()) {
1673       BasicBlock *Dead = NewDead.pop_back_val();
1674       if (DeadBlocks.insert(Dead))
1675         // Continue growing the dead block lists.
1676         for (BasicBlock *S : successors(Dead))
1677           if (IsNewlyDead(S))
1678             NewDead.push_back(S);
1679     }
1680   }
1681 }
1682 
1683 /// \brief Analyze a call site for potential inlining.
1684 ///
1685 /// Returns true if inlining this call is viable, and false if it is not
1686 /// viable. It computes the cost and adjusts the threshold based on numerous
1687 /// factors and heuristics. If this method returns false but the computed cost
1688 /// is below the computed threshold, then inlining was forcibly disabled by
1689 /// some artifact of the routine.
1690 bool CallAnalyzer::analyzeCall(CallSite CS) {
1691   ++NumCallsAnalyzed;
1692 
1693   // Perform some tweaks to the cost and threshold based on the direct
1694   // callsite information.
1695 
1696   // We want to more aggressively inline vector-dense kernels, so up the
1697   // threshold, and we'll lower it if the % of vector instructions gets too
1698   // low. Note that these bonuses are some what arbitrary and evolved over time
1699   // by accident as much as because they are principled bonuses.
1700   //
1701   // FIXME: It would be nice to remove all such bonuses. At least it would be
1702   // nice to base the bonus values on something more scientific.
1703   assert(NumInstructions == 0);
1704   assert(NumVectorInstructions == 0);
1705 
1706   // Update the threshold based on callsite properties
1707   updateThreshold(CS, F);
1708 
1709   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1710   // this Threshold any time, and cost cannot decrease, we can stop processing
1711   // the rest of the function body.
1712   Threshold += (SingleBBBonus + VectorBonus);
1713 
1714   // Give out bonuses for the callsite, as the instructions setting them up
1715   // will be gone after inlining.
1716   Cost -= getCallsiteCost(CS, DL);
1717 
1718   // If this function uses the coldcc calling convention, prefer not to inline
1719   // it.
1720   if (F.getCallingConv() == CallingConv::Cold)
1721     Cost += InlineConstants::ColdccPenalty;
1722 
1723   // Check if we're done. This can happen due to bonuses and penalties.
1724   if (Cost >= Threshold && !ComputeFullInlineCost)
1725     return false;
1726 
1727   if (F.empty())
1728     return true;
1729 
1730   Function *Caller = CS.getInstruction()->getFunction();
1731   // Check if the caller function is recursive itself.
1732   for (User *U : Caller->users()) {
1733     CallSite Site(U);
1734     if (!Site)
1735       continue;
1736     Instruction *I = Site.getInstruction();
1737     if (I->getFunction() == Caller) {
1738       IsCallerRecursive = true;
1739       break;
1740     }
1741   }
1742 
1743   // Populate our simplified values by mapping from function arguments to call
1744   // arguments with known important simplifications.
1745   CallSite::arg_iterator CAI = CS.arg_begin();
1746   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1747        FAI != FAE; ++FAI, ++CAI) {
1748     assert(CAI != CS.arg_end());
1749     if (Constant *C = dyn_cast<Constant>(CAI))
1750       SimplifiedValues[&*FAI] = C;
1751 
1752     Value *PtrArg = *CAI;
1753     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1754       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1755 
1756       // We can SROA any pointer arguments derived from alloca instructions.
1757       if (isa<AllocaInst>(PtrArg)) {
1758         SROAArgValues[&*FAI] = PtrArg;
1759         SROAArgCosts[PtrArg] = 0;
1760       }
1761     }
1762   }
1763   NumConstantArgs = SimplifiedValues.size();
1764   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1765   NumAllocaArgs = SROAArgValues.size();
1766 
1767   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1768   // the ephemeral values multiple times (and they're completely determined by
1769   // the callee, so this is purely duplicate work).
1770   SmallPtrSet<const Value *, 32> EphValues;
1771   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1772 
1773   // The worklist of live basic blocks in the callee *after* inlining. We avoid
1774   // adding basic blocks of the callee which can be proven to be dead for this
1775   // particular call site in order to get more accurate cost estimates. This
1776   // requires a somewhat heavyweight iteration pattern: we need to walk the
1777   // basic blocks in a breadth-first order as we insert live successors. To
1778   // accomplish this, prioritizing for small iterations because we exit after
1779   // crossing our threshold, we use a small-size optimized SetVector.
1780   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1781                     SmallPtrSet<BasicBlock *, 16>>
1782       BBSetVector;
1783   BBSetVector BBWorklist;
1784   BBWorklist.insert(&F.getEntryBlock());
1785   bool SingleBB = true;
1786   // Note that we *must not* cache the size, this loop grows the worklist.
1787   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1788     // Bail out the moment we cross the threshold. This means we'll under-count
1789     // the cost, but only when undercounting doesn't matter.
1790     if (Cost >= Threshold && !ComputeFullInlineCost)
1791       break;
1792 
1793     BasicBlock *BB = BBWorklist[Idx];
1794     if (BB->empty())
1795       continue;
1796 
1797     // Disallow inlining a blockaddress. A blockaddress only has defined
1798     // behavior for an indirect branch in the same function, and we do not
1799     // currently support inlining indirect branches. But, the inliner may not
1800     // see an indirect branch that ends up being dead code at a particular call
1801     // site. If the blockaddress escapes the function, e.g., via a global
1802     // variable, inlining may lead to an invalid cross-function reference.
1803     if (BB->hasAddressTaken())
1804       return false;
1805 
1806     // Analyze the cost of this block. If we blow through the threshold, this
1807     // returns false, and we can bail on out.
1808     if (!analyzeBlock(BB, EphValues))
1809       return false;
1810 
1811     TerminatorInst *TI = BB->getTerminator();
1812 
1813     // Add in the live successors by first checking whether we have terminator
1814     // that may be simplified based on the values simplified by this call.
1815     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1816       if (BI->isConditional()) {
1817         Value *Cond = BI->getCondition();
1818         if (ConstantInt *SimpleCond =
1819                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1820           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1821           BBWorklist.insert(NextBB);
1822           KnownSuccessors[BB] = NextBB;
1823           findDeadBlocks(BB, NextBB);
1824           continue;
1825         }
1826       }
1827     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1828       Value *Cond = SI->getCondition();
1829       if (ConstantInt *SimpleCond =
1830               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1831         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1832         BBWorklist.insert(NextBB);
1833         KnownSuccessors[BB] = NextBB;
1834         findDeadBlocks(BB, NextBB);
1835         continue;
1836       }
1837     }
1838 
1839     // If we're unable to select a particular successor, just count all of
1840     // them.
1841     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1842          ++TIdx)
1843       BBWorklist.insert(TI->getSuccessor(TIdx));
1844 
1845     // If we had any successors at this point, than post-inlining is likely to
1846     // have them as well. Note that we assume any basic blocks which existed
1847     // due to branches or switches which folded above will also fold after
1848     // inlining.
1849     if (SingleBB && TI->getNumSuccessors() > 1) {
1850       // Take off the bonus we applied to the threshold.
1851       Threshold -= SingleBBBonus;
1852       SingleBB = false;
1853     }
1854   }
1855 
1856   bool OnlyOneCallAndLocalLinkage =
1857       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1858   // If this is a noduplicate call, we can still inline as long as
1859   // inlining this would cause the removal of the caller (so the instruction
1860   // is not actually duplicated, just moved).
1861   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1862     return false;
1863 
1864   // We applied the maximum possible vector bonus at the beginning. Now,
1865   // subtract the excess bonus, if any, from the Threshold before
1866   // comparing against Cost.
1867   if (NumVectorInstructions <= NumInstructions / 10)
1868     Threshold -= VectorBonus;
1869   else if (NumVectorInstructions <= NumInstructions / 2)
1870     Threshold -= VectorBonus/2;
1871 
1872   return Cost < std::max(1, Threshold);
1873 }
1874 
1875 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1876 /// \brief Dump stats about this call's analysis.
1877 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1878 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
1879   DEBUG_PRINT_STAT(NumConstantArgs);
1880   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1881   DEBUG_PRINT_STAT(NumAllocaArgs);
1882   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1883   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1884   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1885   DEBUG_PRINT_STAT(NumInstructions);
1886   DEBUG_PRINT_STAT(SROACostSavings);
1887   DEBUG_PRINT_STAT(SROACostSavingsLost);
1888   DEBUG_PRINT_STAT(LoadEliminationCost);
1889   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1890   DEBUG_PRINT_STAT(Cost);
1891   DEBUG_PRINT_STAT(Threshold);
1892 #undef DEBUG_PRINT_STAT
1893 }
1894 #endif
1895 
1896 /// \brief Test that there are no attribute conflicts between Caller and Callee
1897 ///        that prevent inlining.
1898 static bool functionsHaveCompatibleAttributes(Function *Caller,
1899                                               Function *Callee,
1900                                               TargetTransformInfo &TTI) {
1901   return TTI.areInlineCompatible(Caller, Callee) &&
1902          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1903 }
1904 
1905 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1906   int Cost = 0;
1907   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1908     if (CS.isByValArgument(I)) {
1909       // We approximate the number of loads and stores needed by dividing the
1910       // size of the byval type by the target's pointer size.
1911       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1912       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1913       unsigned AS = PTy->getAddressSpace();
1914       unsigned PointerSize = DL.getPointerSizeInBits(AS);
1915       // Ceiling division.
1916       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1917 
1918       // If it generates more than 8 stores it is likely to be expanded as an
1919       // inline memcpy so we take that as an upper bound. Otherwise we assume
1920       // one load and one store per word copied.
1921       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1922       // here instead of a magic number of 8, but it's not available via
1923       // DataLayout.
1924       NumStores = std::min(NumStores, 8U);
1925 
1926       Cost += 2 * NumStores * InlineConstants::InstrCost;
1927     } else {
1928       // For non-byval arguments subtract off one instruction per call
1929       // argument.
1930       Cost += InlineConstants::InstrCost;
1931     }
1932   }
1933   // The call instruction also disappears after inlining.
1934   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1935   return Cost;
1936 }
1937 
1938 InlineCost llvm::getInlineCost(
1939     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1940     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1941     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1942     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1943   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1944                        GetAssumptionCache, GetBFI, PSI, ORE);
1945 }
1946 
1947 InlineCost llvm::getInlineCost(
1948     CallSite CS, Function *Callee, const InlineParams &Params,
1949     TargetTransformInfo &CalleeTTI,
1950     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1951     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1952     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1953 
1954   // Cannot inline indirect calls.
1955   if (!Callee)
1956     return llvm::InlineCost::getNever();
1957 
1958   // Calls to functions with always-inline attributes should be inlined
1959   // whenever possible.
1960   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1961     if (isInlineViable(*Callee))
1962       return llvm::InlineCost::getAlways();
1963     return llvm::InlineCost::getNever();
1964   }
1965 
1966   // Never inline functions with conflicting attributes (unless callee has
1967   // always-inline attribute).
1968   Function *Caller = CS.getCaller();
1969   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
1970     return llvm::InlineCost::getNever();
1971 
1972   // Don't inline this call if the caller has the optnone attribute.
1973   if (Caller->hasFnAttribute(Attribute::OptimizeNone))
1974     return llvm::InlineCost::getNever();
1975 
1976   // Don't inline functions which can be interposed at link-time.  Don't inline
1977   // functions marked noinline or call sites marked noinline.
1978   // Note: inlining non-exact non-interposable functions is fine, since we know
1979   // we have *a* correct implementation of the source level function.
1980   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
1981       CS.isNoInline())
1982     return llvm::InlineCost::getNever();
1983 
1984   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
1985                      << "... (caller:" << Caller->getName() << ")\n");
1986 
1987   CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
1988                   Params);
1989   bool ShouldInline = CA.analyzeCall(CS);
1990 
1991   DEBUG(CA.dump());
1992 
1993   // Check if there was a reason to force inlining or no inlining.
1994   if (!ShouldInline && CA.getCost() < CA.getThreshold())
1995     return InlineCost::getNever();
1996   if (ShouldInline && CA.getCost() >= CA.getThreshold())
1997     return InlineCost::getAlways();
1998 
1999   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2000 }
2001 
2002 bool llvm::isInlineViable(Function &F) {
2003   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2004   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2005     // Disallow inlining of functions which contain indirect branches or
2006     // blockaddresses.
2007     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2008       return false;
2009 
2010     for (auto &II : *BI) {
2011       CallSite CS(&II);
2012       if (!CS)
2013         continue;
2014 
2015       // Disallow recursive calls.
2016       if (&F == CS.getCalledFunction())
2017         return false;
2018 
2019       // Disallow calls which expose returns-twice to a function not previously
2020       // attributed as such.
2021       if (!ReturnsTwice && CS.isCall() &&
2022           cast<CallInst>(CS.getInstruction())->canReturnTwice())
2023         return false;
2024 
2025       // Disallow inlining functions that call @llvm.localescape. Doing this
2026       // correctly would require major changes to the inliner.
2027       if (CS.getCalledFunction() &&
2028           CS.getCalledFunction()->getIntrinsicID() ==
2029               llvm::Intrinsic::localescape)
2030         return false;
2031     }
2032   }
2033 
2034   return true;
2035 }
2036 
2037 // APIs to create InlineParams based on command line flags and/or other
2038 // parameters.
2039 
2040 InlineParams llvm::getInlineParams(int Threshold) {
2041   InlineParams Params;
2042 
2043   // This field is the threshold to use for a callee by default. This is
2044   // derived from one or more of:
2045   //  * optimization or size-optimization levels,
2046   //  * a value passed to createFunctionInliningPass function, or
2047   //  * the -inline-threshold flag.
2048   //  If the -inline-threshold flag is explicitly specified, that is used
2049   //  irrespective of anything else.
2050   if (InlineThreshold.getNumOccurrences() > 0)
2051     Params.DefaultThreshold = InlineThreshold;
2052   else
2053     Params.DefaultThreshold = Threshold;
2054 
2055   // Set the HintThreshold knob from the -inlinehint-threshold.
2056   Params.HintThreshold = HintThreshold;
2057 
2058   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2059   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2060 
2061   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2062   // populate LocallyHotCallSiteThreshold. Later, we populate
2063   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2064   // we know that optimization level is O3 (in the getInlineParams variant that
2065   // takes the opt and size levels).
2066   // FIXME: Remove this check (and make the assignment unconditional) after
2067   // addressing size regression issues at O2.
2068   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2069     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2070 
2071   // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2072   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2073 
2074   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2075   // -inlinehint-threshold commandline option is not explicitly given. If that
2076   // option is present, then its value applies even for callees with size and
2077   // minsize attributes.
2078   // If the -inline-threshold is not specified, set the ColdThreshold from the
2079   // -inlinecold-threshold even if it is not explicitly passed. If
2080   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2081   // explicitly specified to set the ColdThreshold knob
2082   if (InlineThreshold.getNumOccurrences() == 0) {
2083     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2084     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2085     Params.ColdThreshold = ColdThreshold;
2086   } else if (ColdThreshold.getNumOccurrences() > 0) {
2087     Params.ColdThreshold = ColdThreshold;
2088   }
2089   return Params;
2090 }
2091 
2092 InlineParams llvm::getInlineParams() {
2093   return getInlineParams(InlineThreshold);
2094 }
2095 
2096 // Compute the default threshold for inlining based on the opt level and the
2097 // size opt level.
2098 static int computeThresholdFromOptLevels(unsigned OptLevel,
2099                                          unsigned SizeOptLevel) {
2100   if (OptLevel > 2)
2101     return InlineConstants::OptAggressiveThreshold;
2102   if (SizeOptLevel == 1) // -Os
2103     return InlineConstants::OptSizeThreshold;
2104   if (SizeOptLevel == 2) // -Oz
2105     return InlineConstants::OptMinSizeThreshold;
2106   return InlineThreshold;
2107 }
2108 
2109 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2110   auto Params =
2111       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2112   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2113   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2114   // when it is specified explicitly.
2115   if (OptLevel > 2)
2116     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2117   return Params;
2118 }
2119