1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/Analysis/ValueTracking.h"
29 #include "llvm/IR/CallSite.h"
30 #include "llvm/IR/CallingConv.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/GetElementPtrTypeIterator.h"
33 #include "llvm/IR/GlobalAlias.h"
34 #include "llvm/IR/InstVisitor.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Operator.h"
37 #include "llvm/Support/Debug.h"
38 #include "llvm/Support/raw_ostream.h"
39 
40 using namespace llvm;
41 
42 #define DEBUG_TYPE "inline-cost"
43 
44 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
45 
46 static cl::opt<int> InlineThreshold(
47     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
48     cl::desc("Control the amount of inlining to perform (default = 225)"));
49 
50 static cl::opt<int> HintThreshold(
51     "inlinehint-threshold", cl::Hidden, cl::init(325),
52     cl::desc("Threshold for inlining functions with inline hint"));
53 
54 static cl::opt<int>
55     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
56                           cl::init(45),
57                           cl::desc("Threshold for inlining cold callsites"));
58 
59 // We introduce this threshold to help performance of instrumentation based
60 // PGO before we actually hook up inliner with analysis passes such as BPI and
61 // BFI.
62 static cl::opt<int> ColdThreshold(
63     "inlinecold-threshold", cl::Hidden, cl::init(45),
64     cl::desc("Threshold for inlining functions with cold attribute"));
65 
66 static cl::opt<int>
67     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
68                          cl::ZeroOrMore,
69                          cl::desc("Threshold for hot callsites "));
70 
71 static cl::opt<int> LocallyHotCallSiteThreshold(
72     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
73     cl::desc("Threshold for locally hot callsites "));
74 
75 static cl::opt<int> ColdCallSiteRelFreq(
76     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
77     cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
78              "entry frequency, for a callsite to be cold in the absence of "
79              "profile information."));
80 
81 static cl::opt<int> HotCallSiteRelFreq(
82     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
83     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
84              "entry frequency, for a callsite to be hot in the absence of "
85              "profile information."));
86 
87 static cl::opt<bool> OptComputeFullInlineCost(
88     "inline-cost-full", cl::Hidden, cl::init(false),
89     cl::desc("Compute the full inline cost of a call site even when the cost "
90              "exceeds the threshold."));
91 
92 namespace {
93 
/// \brief Cost analyzer for a single candidate call site.
///
/// Walks the callee's instructions, simulating the simplifications that would
/// occur after inlining at this particular call site, and accumulates an
/// abstract cost which the inliner compares against a per-callsite threshold.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Bound that Cost is compared against; adjusted per callsite by
  /// updateThreshold().
  int Threshold;
  /// Accumulated inlining cost, in abstract units.
  int Cost;
  /// When true, keep analyzing even after Cost exceeds Threshold (used for
  /// remarks and debugging).
  bool ComputeFullInlineCost;

  // Properties discovered while walking the callee. Several of these make the
  // call site non-inlinable outright (see analyzeCall).
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;
  bool UsesVarArgs;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  // Bonus applied when a notable fraction of instructions are vector ops.
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration.  Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  // NOTE: member initializers below follow declaration order; keep them in
  // sync when adding fields.
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), UsesVarArgs(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  /// Run the analysis over \p CS; results are read back via getCost() and
  /// getThreshold().
  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};
309 
310 } // namespace
311 
312 /// \brief Test whether the given value is an Alloca-derived function argument.
313 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
314   return SROAArgValues.count(V);
315 }
316 
317 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
318 /// Returns false if V does not map to a SROA-candidate.
319 bool CallAnalyzer::lookupSROAArgAndCost(
320     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
321   if (SROAArgValues.empty() || SROAArgCosts.empty())
322     return false;
323 
324   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
325   if (ArgIt == SROAArgValues.end())
326     return false;
327 
328   Arg = ArgIt->second;
329   CostIt = SROAArgCosts.find(Arg);
330   return CostIt != SROAArgCosts.end();
331 }
332 
333 /// \brief Disable SROA for the candidate marked by this cost iterator.
334 ///
335 /// This marks the candidate as no longer viable for SROA, and adds the cost
336 /// savings associated with it back into the inline cost measurement.
337 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
338   // If we're no longer able to perform SROA we need to undo its cost savings
339   // and prevent subsequent analysis.
340   Cost += CostIt->second;
341   SROACostSavings -= CostIt->second;
342   SROACostSavingsLost += CostIt->second;
343   SROAArgCosts.erase(CostIt);
344   disableLoadElimination();
345 }
346 
347 /// \brief If 'V' maps to a SROA candidate, disable SROA for it.
348 void CallAnalyzer::disableSROA(Value *V) {
349   Value *SROAArg;
350   DenseMap<Value *, int>::iterator CostIt;
351   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
352     disableSROA(CostIt);
353 }
354 
355 /// \brief Accumulate the given cost for a particular SROA candidate.
356 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
357                                       int InstructionCost) {
358   CostIt->second += InstructionCost;
359   SROACostSavings += InstructionCost;
360 }
361 
362 void CallAnalyzer::disableLoadElimination() {
363   if (EnableLoadElimination) {
364     Cost += LoadEliminationCost;
365     LoadEliminationCost = 0;
366     EnableLoadElimination = false;
367   }
368 }
369 
370 /// \brief Accumulate a constant GEP offset into an APInt if possible.
371 ///
372 /// Returns false if unable to compute the offset for any reason. Respects any
373 /// simplified values known during the analysis of this callsite.
374 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
375   unsigned IntPtrWidth = DL.getPointerTypeSizeInBits(GEP.getType());
376   assert(IntPtrWidth == Offset.getBitWidth());
377 
378   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
379        GTI != GTE; ++GTI) {
380     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
381     if (!OpC)
382       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
383         OpC = dyn_cast<ConstantInt>(SimpleOp);
384     if (!OpC)
385       return false;
386     if (OpC->isZero())
387       continue;
388 
389     // Handle a struct index, which adds its field offset to the pointer.
390     if (StructType *STy = GTI.getStructTypeOrNull()) {
391       unsigned ElementIdx = OpC->getZExtValue();
392       const StructLayout *SL = DL.getStructLayout(STy);
393       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
394       continue;
395     }
396 
397     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
398     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
399   }
400   return true;
401 }
402 
403 /// \brief Use TTI to check whether a GEP is free.
404 ///
405 /// Respects any simplified values known during the analysis of this callsite.
406 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
407   SmallVector<Value *, 4> Operands;
408   Operands.push_back(GEP.getOperand(0));
409   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
410     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
411        Operands.push_back(SimpleOp);
412      else
413        Operands.push_back(*I);
414   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
415 }
416 
417 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
418   // Check whether inlining will turn a dynamic alloca into a static
419   // alloca and handle that case.
420   if (I.isArrayAllocation()) {
421     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
422     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
423       Type *Ty = I.getAllocatedType();
424       AllocatedSize = SaturatingMultiplyAdd(
425           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
426       return Base::visitAlloca(I);
427     }
428   }
429 
430   // Accumulate the allocated size.
431   if (I.isStaticAlloca()) {
432     Type *Ty = I.getAllocatedType();
433     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
434   }
435 
436   // We will happily inline static alloca instructions.
437   if (I.isStaticAlloca())
438     return Base::visitAlloca(I);
439 
440   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
441   // a variety of reasons, and so we would like to not inline them into
442   // functions which don't currently have a dynamic alloca. This simply
443   // disables inlining altogether in the presence of a dynamic alloca.
444   HasDynamicAlloca = true;
445   return false;
446 }
447 
/// \brief Handle a PHI node, trying to fold it to a single constant or a
/// single pointer-with-constant-offset when all live incoming values agree.
/// PHIs themselves are always modeled as free (returns true).
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate it's bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    // Propagate SROA candidacy from the agreed-upon incoming pointer.
    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}
543 
544 /// \brief Check we can fold GEPs of constant-offset call site argument pointers.
545 /// This requires target data and inbounds GEPs.
546 ///
547 /// \return true if the specified GEP can be folded.
548 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
549   // Check if we have a base + offset for the pointer.
550   std::pair<Value *, APInt> BaseAndOffset =
551       ConstantOffsetPtrs.lookup(I.getPointerOperand());
552   if (!BaseAndOffset.first)
553     return false;
554 
555   // Check if the offset of this GEP is constant, and if so accumulate it
556   // into Offset.
557   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
558     return false;
559 
560   // Add the result as a new mapping to Base + Offset.
561   ConstantOffsetPtrs[&I] = BaseAndOffset;
562 
563   return true;
564 }
565 
566 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
567   Value *SROAArg;
568   DenseMap<Value *, int>::iterator CostIt;
569   bool SROACandidate =
570       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
571 
572   // Lambda to check whether a GEP's indices are all constant.
573   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
574     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
575       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
576         return false;
577     return true;
578   };
579 
580   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
581     if (SROACandidate)
582       SROAArgValues[&I] = SROAArg;
583 
584     // Constant GEPs are modeled as free.
585     return true;
586   }
587 
588   // Variable GEPs will require math and will disable SROA.
589   if (SROACandidate)
590     disableSROA(CostIt);
591   return isGEPFree(I);
592 }
593 
594 /// Simplify \p I if its operands are constants and update SimplifiedValues.
595 /// \p Evaluate is a callable specific to instruction type that evaluates the
596 /// instruction when all the operands are constants.
597 template <typename Callable>
598 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
599   SmallVector<Constant *, 2> COps;
600   for (Value *Op : I.operands()) {
601     Constant *COp = dyn_cast<Constant>(Op);
602     if (!COp)
603       COp = SimplifiedValues.lookup(Op);
604     if (!COp)
605       return false;
606     COps.push_back(COp);
607   }
608   auto *C = Evaluate(COps);
609   if (!C)
610     return false;
611   SimplifiedValues[&I] = C;
612   return true;
613 }
614 
615 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
616   // Propagate constants through bitcasts.
617   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
618         return ConstantExpr::getBitCast(COps[0], I.getType());
619       }))
620     return true;
621 
622   // Track base/offsets through casts
623   std::pair<Value *, APInt> BaseAndOffset =
624       ConstantOffsetPtrs.lookup(I.getOperand(0));
625   // Casts don't change the offset, just wrap it up.
626   if (BaseAndOffset.first)
627     ConstantOffsetPtrs[&I] = BaseAndOffset;
628 
629   // Also look for SROA candidates here.
630   Value *SROAArg;
631   DenseMap<Value *, int>::iterator CostIt;
632   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
633     SROAArgValues[&I] = SROAArg;
634 
635   // Bitcasts are always zero cost.
636   return true;
637 }
638 
639 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
640   // Propagate constants through ptrtoint.
641   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
642         return ConstantExpr::getPtrToInt(COps[0], I.getType());
643       }))
644     return true;
645 
646   // Track base/offset pairs when converted to a plain integer provided the
647   // integer is large enough to represent the pointer.
648   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
649   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
650   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
651     std::pair<Value *, APInt> BaseAndOffset =
652         ConstantOffsetPtrs.lookup(I.getOperand(0));
653     if (BaseAndOffset.first)
654       ConstantOffsetPtrs[&I] = BaseAndOffset;
655   }
656 
657   // This is really weird. Technically, ptrtoint will disable SROA. However,
658   // unless that ptrtoint is *used* somewhere in the live basic blocks after
659   // inlining, it will be nuked, and SROA should proceed. All of the uses which
660   // would block SROA would also block SROA if applied directly to a pointer,
661   // and so we can just add the integer in here. The only places where SROA is
662   // preserved either cannot fire on an integer, or won't in-and-of themselves
663   // disable SROA (ext) w/o some later use that we would see and disable.
664   Value *SROAArg;
665   DenseMap<Value *, int>::iterator CostIt;
666   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
667     SROAArgValues[&I] = SROAArg;
668 
669   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
670 }
671 
672 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
673   // Propagate constants through ptrtoint.
674   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
675         return ConstantExpr::getIntToPtr(COps[0], I.getType());
676       }))
677     return true;
678 
679   // Track base/offset pairs when round-tripped through a pointer without
680   // modifications provided the integer is not too large.
681   Value *Op = I.getOperand(0);
682   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
683   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
684     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
685     if (BaseAndOffset.first)
686       ConstantOffsetPtrs[&I] = BaseAndOffset;
687   }
688 
689   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
690   Value *SROAArg;
691   DenseMap<Value *, int>::iterator CostIt;
692   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
693     SROAArgValues[&I] = SROAArg;
694 
695   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
696 }
697 
698 bool CallAnalyzer::visitCastInst(CastInst &I) {
699   // Propagate constants through ptrtoint.
700   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
701         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
702       }))
703     return true;
704 
705   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
706   disableSROA(I.getOperand(0));
707 
708   // If this is a floating-point cast, and the target says this operation
709   // is expensive, this may eventually become a library call. Treat the cost
710   // as such.
711   switch (I.getOpcode()) {
712   case Instruction::FPTrunc:
713   case Instruction::FPExt:
714   case Instruction::UIToFP:
715   case Instruction::SIToFP:
716   case Instruction::FPToUI:
717   case Instruction::FPToSI:
718     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
719       Cost += InlineConstants::CallPenalty;
720   default:
721     break;
722   }
723 
724   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
725 }
726 
727 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
728   Value *Operand = I.getOperand(0);
729   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
730         return ConstantFoldInstOperands(&I, COps[0], DL);
731       }))
732     return true;
733 
734   // Disable any SROA on the argument to arbitrary unary operators.
735   disableSROA(Operand);
736 
737   return false;
738 }
739 
740 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
741   return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
742 }
743 
744 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
745   // Does the *call site* have the NonNull attribute set on an argument?  We
746   // use the attribute on the call site to memoize any analysis done in the
747   // caller. This will also trip if the callee function has a non-null
748   // parameter attribute, but that's a less interesting case because hopefully
749   // the callee would already have been simplified based on that.
750   if (Argument *A = dyn_cast<Argument>(V))
751     if (paramHasAttr(A, Attribute::NonNull))
752       return true;
753 
754   // Is this an alloca in the caller?  This is distinct from the attribute case
755   // above because attributes aren't updated within the inliner itself and we
756   // always want to catch the alloca derived case.
757   if (isAllocaDerivedArg(V))
758     // We can actually predict the result of comparisons between an
759     // alloca-derived value and null. Note that this fires regardless of
760     // SROA firing.
761     return true;
762 
763   return false;
764 }
765 
766 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
767   // If the normal destination of the invoke or the parent block of the call
768   // site is unreachable-terminated, there is little point in inlining this
769   // unless there is literally zero cost.
770   // FIXME: Note that it is possible that an unreachable-terminated block has a
771   // hot entry. For example, in below scenario inlining hot_call_X() may be
772   // beneficial :
773   // main() {
774   //   hot_call_1();
775   //   ...
776   //   hot_call_N()
777   //   exit(0);
778   // }
779   // For now, we are not handling this corner case here as it is rare in real
780   // code. In future, we should elaborate this based on BPI and BFI in more
781   // general threshold adjusting heuristics in updateThreshold().
782   Instruction *Instr = CS.getInstruction();
783   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
784     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
785       return false;
786   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
787     return false;
788 
789   return true;
790 }
791 
792 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
793   // If global profile summary is available, then callsite's coldness is
794   // determined based on that.
795   if (PSI && PSI->hasProfileSummary())
796     return PSI->isColdCallSite(CS, CallerBFI);
797 
798   // Otherwise we need BFI to be available.
799   if (!CallerBFI)
800     return false;
801 
802   // Determine if the callsite is cold relative to caller's entry. We could
803   // potentially cache the computation of scaled entry frequency, but the added
804   // complexity is not worth it unless this scaling shows up high in the
805   // profiles.
806   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
807   auto CallSiteBB = CS.getInstruction()->getParent();
808   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
809   auto CallerEntryFreq =
810       CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
811   return CallSiteFreq < CallerEntryFreq * ColdProb;
812 }
813 
814 Optional<int>
815 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
816                                       BlockFrequencyInfo *CallerBFI) {
817 
818   // If global profile summary is available, then callsite's hotness is
819   // determined based on that.
820   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
821     return Params.HotCallSiteThreshold;
822 
823   // Otherwise we need BFI to be available and to have a locally hot callsite
824   // threshold.
825   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
826     return None;
827 
828   // Determine if the callsite is hot relative to caller's entry. We could
829   // potentially cache the computation of scaled entry frequency, but the added
830   // complexity is not worth it unless this scaling shows up high in the
831   // profiles.
832   auto CallSiteBB = CS.getInstruction()->getParent();
833   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
834   auto CallerEntryFreq = CallerBFI->getEntryFreq();
835   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
836     return Params.LocallyHotCallSiteThreshold;
837 
838   // Otherwise treat it normally.
839   return None;
840 }
841 
/// \brief Update the inlining threshold for this call site.
///
/// Starts from the current Threshold and adjusts it based on: whether size
/// growth is allowed at this callsite, the caller's minsize/optsize
/// attributes, the callee's inlinehint attribute, callsite hotness/coldness
/// (from the profile summary or the caller's BFI), callee entry
/// hotness/coldness (global profile only), and the target's threshold
/// multiplier. It also derives SingleBBBonus and VectorBonus from the final
/// threshold, and applies the last-call-to-static bonus directly to Cost
/// (not Threshold), since that bonus depends on logic local to this method.
void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that amount
  // is applied if it exceeds 10%. Note that these bonuses are some what
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LstCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}
972 
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  // The remaining folds are pointer/integer tricks; they do not apply to
  // floating point comparisons.
  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons. A null check of an
  // SROA candidate folds away when SROA fires, so credit its cost; any other
  // comparison of the candidate defeats SROA for it.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}
1027 
1028 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1029   // Try to handle a special case: we can fold computing the difference of two
1030   // constant-related pointers.
1031   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1032   Value *LHSBase, *RHSBase;
1033   APInt LHSOffset, RHSOffset;
1034   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1035   if (LHSBase) {
1036     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1037     if (RHSBase && LHSBase == RHSBase) {
1038       // We have common bases, fold the subtract to a constant based on the
1039       // offsets.
1040       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1041       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1042       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1043         SimplifiedValues[&I] = C;
1044         ++NumConstantPtrDiffs;
1045         return true;
1046       }
1047     }
1048   }
1049 
1050   // Otherwise, fall back to the generic logic for simplifying and handling
1051   // instructions.
1052   return Base::visitSub(I);
1053 }
1054 
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // Grab constant forms of both operands, either directly or as recorded by
  // earlier simplification during this analysis.
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  // Ask instsimplify whether the operation folds, preferring the constant
  // forms of the operands when available. FP operations must carry their
  // fast-math flags through.
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  // Record a constant result so later instructions can simplify against it.
  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  // Any simplification (constant or otherwise) means the instruction is free.
  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}
1091 
1092 bool CallAnalyzer::visitLoad(LoadInst &I) {
1093   Value *SROAArg;
1094   DenseMap<Value *, int>::iterator CostIt;
1095   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1096     if (I.isSimple()) {
1097       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1098       return true;
1099     }
1100 
1101     disableSROA(CostIt);
1102   }
1103 
1104   // If the data is already loaded from this address and hasn't been clobbered
1105   // by any stores or calls, this load is likely to be redundant and can be
1106   // eliminated.
1107   if (EnableLoadElimination &&
1108       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1109     LoadEliminationCost += InlineConstants::InstrCost;
1110     return true;
1111   }
1112 
1113   return false;
1114 }
1115 
1116 bool CallAnalyzer::visitStore(StoreInst &I) {
1117   Value *SROAArg;
1118   DenseMap<Value *, int>::iterator CostIt;
1119   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1120     if (I.isSimple()) {
1121       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1122       return true;
1123     }
1124 
1125     disableSROA(CostIt);
1126   }
1127 
1128   // The store can potentially clobber loads and prevent repeated loads from
1129   // being eliminated.
1130   // FIXME:
1131   // 1. We can probably keep an initial set of eliminatable loads substracted
1132   // from the cost even when we finally see a store. We just need to disable
1133   // *further* accumulation of elimination savings.
1134   // 2. We should probably at some point thread MemorySSA for the callee into
1135   // this and then use that to actually compute *really* precise savings.
1136   disableLoadElimination();
1137   return false;
1138 }
1139 
1140 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1141   // Constant folding for extract value is trivial.
1142   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1143         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1144       }))
1145     return true;
1146 
1147   // SROA can look through these but give them a cost.
1148   return false;
1149 }
1150 
1151 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1152   // Constant folding for insert value is trivial.
1153   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1154         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1155                                             /*InsertedValueOperand*/ COps[1],
1156                                             I.getIndices());
1157       }))
1158     return true;
1159 
1160   // SROA can look through these but give them a cost.
1161   return false;
1162 }
1163 
1164 /// \brief Try to simplify a call site.
1165 ///
1166 /// Takes a concrete function and callsite and tries to actually simplify it by
1167 /// analyzing the arguments and call itself with instsimplify. Returns true if
1168 /// it has simplified the callsite to some other entity (a constant), making it
1169 /// free.
1170 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1171   // FIXME: Using the instsimplify logic directly for this is inefficient
1172   // because we have to continually rebuild the argument list even when no
1173   // simplifications can be performed. Until that is fixed with remapping
1174   // inside of instsimplify, directly constant fold calls here.
1175   if (!canConstantFoldCallTo(CS, F))
1176     return false;
1177 
1178   // Try to re-map the arguments to constants.
1179   SmallVector<Constant *, 4> ConstantArgs;
1180   ConstantArgs.reserve(CS.arg_size());
1181   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1182        ++I) {
1183     Constant *C = dyn_cast<Constant>(*I);
1184     if (!C)
1185       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1186     if (!C)
1187       return false; // This argument doesn't map to a constant.
1188 
1189     ConstantArgs.push_back(C);
1190   }
1191   if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1192     SimplifiedValues[CS.getInstruction()] = C;
1193     return true;
1194   }
1195 
1196   return false;
1197 }
1198 
/// \brief Model the cost of a call site found in the callee's body.
///
/// Handles: returns-twice calls (aborts the whole analysis), direct calls
/// (constant-fold simplification, special-cased intrinsics, recursion
/// detection, per-argument setup and call-penalty costs), and indirect calls
/// (including a speculative nested analysis of a devirtualized target found
/// through SimplifiedValues).
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        // A memory-writing intrinsic (other than assume-like ones) clobbers
        // tracked loads.
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      case Intrinsic::vastart:
      case Intrinsic::vaend:
        UsesVarArgs = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}
1303 
1304 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1305   // At least one return instruction will be free after inlining.
1306   bool Free = !HasReturn;
1307   HasReturn = true;
1308   return Free;
1309 }
1310 
1311 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1312   // We model unconditional branches as essentially free -- they really
1313   // shouldn't exist at all, but handling them makes the behavior of the
1314   // inliner more regular and predictable. Interestingly, conditional branches
1315   // which will fold away are also free.
1316   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1317          dyn_cast_or_null<ConstantInt>(
1318              SimplifiedValues.lookup(BI.getCondition()));
1319 }
1320 
/// Model a select: fold it when the condition or both arms are known
/// constants, and propagate constant-offset pointer and SROA information
/// through it when both arms agree (or when the condition picks one arm).
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  // Only pointer-typed selects can carry SROA / constant-offset info.
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  // Fetch constant forms of the arms and the condition, either directly or
  // via earlier simplification in this analysis.
  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    // Even with an unknown condition, if both arms are the same
    // constant-offset pointer, the select is that pointer.
    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s.  If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  // Propagate the selected arm's constant-offset base and SROA candidacy to
  // the select itself.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}
1401 
/// Model a switch: free when the condition is (or simplifies to) a constant;
/// otherwise charge for a jump table or a balanced binary tree of case
/// clusters, clamping the accumulated Cost to avoid integer overflow.
bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is :
  // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This will lead a binary tree where the leaf should be either f(2) or f(3)
  // when n > 3.  So, the number of comparisons from leaves should be n, while
  // the number of non-leaf should be :
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form :
  //   n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}
1481 
1482 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1483   // We never want to inline functions that contain an indirectbr.  This is
1484   // incorrect because all the blockaddress's (in static global initializers
1485   // for example) would be referring to the original function, and this
1486   // indirect jump would jump from the inlined copy of the function into the
1487   // original function which is extremely undefined behavior.
1488   // FIXME: This logic isn't really right; we can safely inline functions with
1489   // indirectbr's as long as no other function or global references the
1490   // blockaddress of a block within the current function.
1491   HasIndirectBr = true;
1492   return false;
1493 }
1494 
bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // Modeled as an ordinary (non-free) instruction.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}
1500 
bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // Modeled as an ordinary (non-free) instruction.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}
1506 
bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // Modeled as an ordinary (non-free) instruction.
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}
1512 
bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonably to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}
1519 
1520 bool CallAnalyzer::visitInstruction(Instruction &I) {
1521   // Some instructions are free. All of the free intrinsics can also be
1522   // handled by SROA, etc.
1523   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1524     return true;
1525 
1526   // We found something we don't understand or can't handle. Mark any SROA-able
1527   // values in the operand list as no longer viable.
1528   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1529     disableSROA(*OI);
1530 
1531   return false;
1532 }
1533 
1534 /// \brief Analyze a basic block for its contribution to the inline cost.
1535 ///
1536 /// This method walks the analyzer over every instruction in the given basic
1537 /// block and accounts for their cost during inlining at this callsite. It
1538 /// aborts early if the threshold has been exceeded or an impossible to inline
1539 /// construct has been detected. It returns false if inlining is no longer
1540 /// viable, and true if inlining remains viable.
1541 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1542                                 SmallPtrSetImpl<const Value *> &EphValues) {
1543   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1544     // FIXME: Currently, the number of instructions in a function regardless of
1545     // our ability to simplify them during inline to constants or dead code,
1546     // are actually used by the vector bonus heuristic. As long as that's true,
1547     // we have to special case debug intrinsics here to prevent differences in
1548     // inlining due to debug symbols. Eventually, the number of unsimplified
1549     // instructions shouldn't factor into the cost computation, but until then,
1550     // hack around it here.
1551     if (isa<DbgInfoIntrinsic>(I))
1552       continue;
1553 
1554     // Skip ephemeral values.
1555     if (EphValues.count(&*I))
1556       continue;
1557 
1558     ++NumInstructions;
1559     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1560       ++NumVectorInstructions;
1561 
1562     // If the instruction simplified to a constant, there is no cost to this
1563     // instruction. Visit the instructions using our InstVisitor to account for
1564     // all of the per-instruction logic. The visit tree returns true if we
1565     // consumed the instruction in any way, and false if the instruction's base
1566     // cost should count against inlining.
1567     if (Base::visit(&*I))
1568       ++NumInstructionsSimplified;
1569     else
1570       Cost += InlineConstants::InstrCost;
1571 
1572     using namespace ore;
1573     // If the visit this instruction detected an uninlinable pattern, abort.
1574     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1575         HasIndirectBr || HasFrameEscape || UsesVarArgs) {
1576       if (ORE)
1577         ORE->emit([&]() {
1578           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1579                                           CandidateCS.getInstruction())
1580                  << NV("Callee", &F)
1581                  << " has uninlinable pattern and cost is not fully computed";
1582         });
1583       return false;
1584     }
1585 
1586     // If the caller is a recursive function then we don't want to inline
1587     // functions which allocate a lot of stack space because it would increase
1588     // the caller stack usage dramatically.
1589     if (IsCallerRecursive &&
1590         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1591       if (ORE)
1592         ORE->emit([&]() {
1593           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1594                                           CandidateCS.getInstruction())
1595                  << NV("Callee", &F)
1596                  << " is recursive and allocates too much stack space. Cost is "
1597                     "not fully computed";
1598         });
1599       return false;
1600     }
1601 
1602     // Check if we've past the maximum possible threshold so we don't spin in
1603     // huge basic blocks that will never inline.
1604     if (Cost >= Threshold && !ComputeFullInlineCost)
1605       return false;
1606   }
1607 
1608   return true;
1609 }
1610 
1611 /// \brief Compute the base pointer and cumulative constant offsets for V.
1612 ///
1613 /// This strips all constant offsets off of V, leaving it the base pointer, and
1614 /// accumulates the total constant offset applied in the returned constant. It
1615 /// returns 0 if V is not a pointer, and returns the constant '0' if there are
1616 /// no constant offsets applied.
1617 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1618   if (!V->getType()->isPointerTy())
1619     return nullptr;
1620 
1621   unsigned AS = V->getType()->getPointerAddressSpace();
1622   unsigned IntPtrWidth = DL.getPointerSizeInBits(AS);
1623   APInt Offset = APInt::getNullValue(IntPtrWidth);
1624 
1625   // Even though we don't look through PHI nodes, we could be called on an
1626   // instruction in an unreachable block, which may be on a cycle.
1627   SmallPtrSet<Value *, 4> Visited;
1628   Visited.insert(V);
1629   do {
1630     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1631       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1632         return nullptr;
1633       V = GEP->getPointerOperand();
1634     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1635       V = cast<Operator>(V)->getOperand(0);
1636     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1637       if (GA->isInterposable())
1638         break;
1639       V = GA->getAliasee();
1640     } else {
1641       break;
1642     }
1643     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1644   } while (Visited.insert(V).second);
1645 
1646   Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1647   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1648 }
1649 
1650 /// \brief Find dead blocks due to deleted CFG edges during inlining.
1651 ///
1652 /// If we know the successor of the current block, \p CurrBB, has to be \p
1653 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1654 /// no live incoming CFG edges.  If one block is found to be dead, we can
1655 /// continue growing the dead block list by checking the successors of the dead
1656 /// blocks to see if all their incoming edges are dead or not.
1657 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1658   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1659     // A CFG edge is dead if the predecessor is dead or the predessor has a
1660     // known successor which is not the one under exam.
1661     return (DeadBlocks.count(Pred) ||
1662             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1663   };
1664 
1665   auto IsNewlyDead = [&](BasicBlock *BB) {
1666     // If all the edges to a block are dead, the block is also dead.
1667     return (!DeadBlocks.count(BB) &&
1668             llvm::all_of(predecessors(BB),
1669                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1670   };
1671 
1672   for (BasicBlock *Succ : successors(CurrBB)) {
1673     if (Succ == NextBB || !IsNewlyDead(Succ))
1674       continue;
1675     SmallVector<BasicBlock *, 4> NewDead;
1676     NewDead.push_back(Succ);
1677     while (!NewDead.empty()) {
1678       BasicBlock *Dead = NewDead.pop_back_val();
1679       if (DeadBlocks.insert(Dead))
1680         // Continue growing the dead block lists.
1681         for (BasicBlock *S : successors(Dead))
1682           if (IsNewlyDead(S))
1683             NewDead.push_back(S);
1684     }
1685   }
1686 }
1687 
/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are some what arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);

  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  // A declaration-only callee has no body to analyze; nothing blocks inlining
  // from this routine's perspective.
  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself: look for a use of the
  // caller that is a call site located inside the caller's own body.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    // Record constant-offset pointer arguments so GEP chains off them can be
    // folded during the block analysis below.
    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          // Only the taken arm is live; record it and mark the rest dead.
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        // A constant switch condition selects exactly one live successor.
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, than post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus/2;

  return Cost < std::max(1, Threshold);
}
1879 
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
// Prints each accumulated statistic as "      <name>: <value>". The macro
// stringifies the member name so the printed label always matches the field.
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif
1900 
1901 /// \brief Test that there are no attribute conflicts between Caller and Callee
1902 ///        that prevent inlining.
1903 static bool functionsHaveCompatibleAttributes(Function *Caller,
1904                                               Function *Callee,
1905                                               TargetTransformInfo &TTI) {
1906   return TTI.areInlineCompatible(Caller, Callee) &&
1907          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1908 }
1909 
1910 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1911   int Cost = 0;
1912   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1913     if (CS.isByValArgument(I)) {
1914       // We approximate the number of loads and stores needed by dividing the
1915       // size of the byval type by the target's pointer size.
1916       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1917       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1918       unsigned AS = PTy->getAddressSpace();
1919       unsigned PointerSize = DL.getPointerSizeInBits(AS);
1920       // Ceiling division.
1921       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1922 
1923       // If it generates more than 8 stores it is likely to be expanded as an
1924       // inline memcpy so we take that as an upper bound. Otherwise we assume
1925       // one load and one store per word copied.
1926       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1927       // here instead of a magic number of 8, but it's not available via
1928       // DataLayout.
1929       NumStores = std::min(NumStores, 8U);
1930 
1931       Cost += 2 * NumStores * InlineConstants::InstrCost;
1932     } else {
1933       // For non-byval arguments subtract off one instruction per call
1934       // argument.
1935       Cost += InlineConstants::InstrCost;
1936     }
1937   }
1938   // The call instruction also disappears after inlining.
1939   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1940   return Cost;
1941 }
1942 
InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  // Convenience overload: analyze the call against its statically-known
  // callee (null for indirect calls, which the main overload rejects).
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}
1951 
1952 InlineCost llvm::getInlineCost(
1953     CallSite CS, Function *Callee, const InlineParams &Params,
1954     TargetTransformInfo &CalleeTTI,
1955     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1956     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1957     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1958 
1959   // Cannot inline indirect calls.
1960   if (!Callee)
1961     return llvm::InlineCost::getNever();
1962 
1963   // Never inline calls with byval arguments that does not have the alloca
1964   // address space. Since byval arguments can be replaced with a copy to an
1965   // alloca, the inlined code would need to be adjusted to handle that the
1966   // argument is in the alloca address space (so it is a little bit complicated
1967   // to solve).
1968   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
1969   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
1970     if (CS.isByValArgument(I)) {
1971       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1972       if (PTy->getAddressSpace() != AllocaAS)
1973         return llvm::InlineCost::getNever();
1974     }
1975 
1976   // Calls to functions with always-inline attributes should be inlined
1977   // whenever possible.
1978   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1979     if (isInlineViable(*Callee))
1980       return llvm::InlineCost::getAlways();
1981     return llvm::InlineCost::getNever();
1982   }
1983 
1984   // Never inline functions with conflicting attributes (unless callee has
1985   // always-inline attribute).
1986   Function *Caller = CS.getCaller();
1987   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
1988     return llvm::InlineCost::getNever();
1989 
1990   // Don't inline this call if the caller has the optnone attribute.
1991   if (Caller->hasFnAttribute(Attribute::OptimizeNone))
1992     return llvm::InlineCost::getNever();
1993 
1994   // Don't inline functions which can be interposed at link-time.  Don't inline
1995   // functions marked noinline or call sites marked noinline.
1996   // Note: inlining non-exact non-interposable functions is fine, since we know
1997   // we have *a* correct implementation of the source level function.
1998   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
1999       CS.isNoInline())
2000     return llvm::InlineCost::getNever();
2001 
2002   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
2003                      << "... (caller:" << Caller->getName() << ")\n");
2004 
2005   CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2006                   Params);
2007   bool ShouldInline = CA.analyzeCall(CS);
2008 
2009   DEBUG(CA.dump());
2010 
2011   // Check if there was a reason to force inlining or no inlining.
2012   if (!ShouldInline && CA.getCost() < CA.getThreshold())
2013     return InlineCost::getNever();
2014   if (ShouldInline && CA.getCost() >= CA.getThreshold())
2015     return InlineCost::getAlways();
2016 
2017   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2018 }
2019 
2020 bool llvm::isInlineViable(Function &F) {
2021   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2022   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2023     // Disallow inlining of functions which contain indirect branches or
2024     // blockaddresses.
2025     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2026       return false;
2027 
2028     for (auto &II : *BI) {
2029       CallSite CS(&II);
2030       if (!CS)
2031         continue;
2032 
2033       // Disallow recursive calls.
2034       if (&F == CS.getCalledFunction())
2035         return false;
2036 
2037       // Disallow calls which expose returns-twice to a function not previously
2038       // attributed as such.
2039       if (!ReturnsTwice && CS.isCall() &&
2040           cast<CallInst>(CS.getInstruction())->canReturnTwice())
2041         return false;
2042 
2043       // Disallow inlining functions that call @llvm.localescape. Doing this
2044       // correctly would require major changes to the inliner.
2045       if (CS.getCalledFunction() &&
2046           CS.getCalledFunction()->getIntrinsicID() ==
2047               llvm::Intrinsic::localescape)
2048         return false;
2049     }
2050   }
2051 
2052   return true;
2053 }
2054 
2055 // APIs to create InlineParams based on command line flags and/or other
2056 // parameters.
2057 
InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  //  If the -inline-threshold flag is explicitly specified, that is used
  //  irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}
2109 
InlineParams llvm::getInlineParams() {
  // With no arguments, derive everything from the -inline-threshold flag
  // (default 225).
  return getInlineParams(InlineThreshold);
}
2113 
2114 // Compute the default threshold for inlining based on the opt level and the
2115 // size opt level.
2116 static int computeThresholdFromOptLevels(unsigned OptLevel,
2117                                          unsigned SizeOptLevel) {
2118   if (OptLevel > 2)
2119     return InlineConstants::OptAggressiveThreshold;
2120   if (SizeOptLevel == 1) // -Os
2121     return InlineConstants::OptSizeThreshold;
2122   if (SizeOptLevel == 2) // -Oz
2123     return InlineConstants::OptMinSizeThreshold;
2124   return InlineThreshold;
2125 }
2126 
2127 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2128   auto Params =
2129       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2130   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2131   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2132   // when it is specified explicitly.
2133   if (OptLevel > 2)
2134     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2135   return Params;
2136 }
2137