1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/LoopInfo.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Config/llvm-config.h"
31 #include "llvm/IR/CallSite.h"
32 #include "llvm/IR/CallingConv.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/GetElementPtrTypeIterator.h"
36 #include "llvm/IR/GlobalAlias.h"
37 #include "llvm/IR/InstVisitor.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Operator.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/raw_ostream.h"
42 
43 using namespace llvm;
44 
45 #define DEBUG_TYPE "inline-cost"
46 
47 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
48 
49 static cl::opt<int> InlineThreshold(
50     "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
51     cl::desc("Control the amount of inlining to perform (default = 225)"));
52 
53 static cl::opt<int> HintThreshold(
54     "inlinehint-threshold", cl::Hidden, cl::init(325),
55     cl::desc("Threshold for inlining functions with inline hint"));
56 
57 static cl::opt<int>
58     ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
59                           cl::init(45),
60                           cl::desc("Threshold for inlining cold callsites"));
61 
62 // We introduce this threshold to help performance of instrumentation-based
63 // PGO before we actually hook up the inliner with analysis passes such as BPI
64 // and BFI.
65 static cl::opt<int> ColdThreshold(
66     "inlinecold-threshold", cl::Hidden, cl::init(45),
67     cl::desc("Threshold for inlining functions with cold attribute"));
68 
69 static cl::opt<int>
70     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
71                          cl::ZeroOrMore,
72                          cl::desc("Threshold for hot callsites"));
73 
74 static cl::opt<int> LocallyHotCallSiteThreshold(
75     "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
76     cl::desc("Threshold for locally hot callsites"));
77 
78 static cl::opt<int> ColdCallSiteRelFreq(
79     "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
80     cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
81              "entry frequency, for a callsite to be cold in the absence of "
82              "profile information."));
83 
84 static cl::opt<int> HotCallSiteRelFreq(
85     "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
86     cl::desc("Minimum block frequency, expressed as a multiple of caller's "
87              "entry frequency, for a callsite to be hot in the absence of "
88              "profile information."));
89 
90 static cl::opt<bool> OptComputeFullInlineCost(
91     "inline-cost-full", cl::Hidden, cl::init(false),
92     cl::desc("Compute the full inline cost of a call site even when the cost "
93              "exceeds the threshold."));
94 
95 namespace {
96 
97 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
98   typedef InstVisitor<CallAnalyzer, bool> Base;
99   friend class InstVisitor<CallAnalyzer, bool>;
100 
101   /// The TargetTransformInfo available for this compilation.
102   const TargetTransformInfo &TTI;
103 
104   /// Getter for the cache of @llvm.assume intrinsics.
105   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
106 
107   /// Getter for BlockFrequencyInfo
108   Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
109 
110   /// Profile summary information.
111   ProfileSummaryInfo *PSI;
112 
113   /// The called function.
114   Function &F;
115 
116   // Cache the DataLayout since we use it a lot.
117   const DataLayout &DL;
118 
119   /// The OptimizationRemarkEmitter available for this compilation.
120   OptimizationRemarkEmitter *ORE;
121 
122   /// The candidate callsite being analyzed. Please do not use this to do
123   /// analysis in the caller function; we want the inline cost query to be
124   /// easily cacheable. Instead, use the paramHasAttr wrapper.
125   CallSite CandidateCS;
126 
127   /// Tunable parameters that control the analysis.
128   const InlineParams &Params;
129 
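  /// The threshold this call site must stay under to be inlined, and the
  /// running cost estimate accumulated so far. ComputeFullInlineCost keeps the
  /// analysis going even after Cost exceeds Threshold (for example so that
  /// optimization remarks can report the full cost).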
130   int Threshold;
131   int Cost;
132   bool ComputeFullInlineCost;
133 
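  // Properties of the callee discovered while walking its instructions.
  // Several of these (e.g. IsRecursiveCall, ExposesReturnsTwice,
  // HasDynamicAlloca, HasIndirectBr) make the call site ineligible for
  // inlining.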
134   bool IsCallerRecursive;
135   bool IsRecursiveCall;
136   bool ExposesReturnsTwice;
137   bool HasDynamicAlloca;
138   bool ContainsNoDuplicateCall;
139   bool HasReturn;
140   bool HasIndirectBr;
141   bool HasUninlineableIntrinsic;
142   bool InitsVargArgs;
143 
144   /// Number of bytes allocated statically by the callee.
145   uint64_t AllocatedSize;
146   unsigned NumInstructions, NumVectorInstructions;
147   int VectorBonus, TenPercentVectorBonus;
148   // Bonus to be applied when the callee has only one reachable basic block.
149   int SingleBBBonus;
150 
151   /// While we walk the potentially-inlined instructions, we build up and
152   /// maintain a mapping of simplified values specific to this callsite. The
153   /// idea is to propagate any special information we have about arguments to
154   /// this call through the inlinable section of the function, and account for
155   /// likely simplifications post-inlining. The most important aspect we track
156   /// is CFG altering simplifications -- when we prove a basic block dead, that
157   /// can cause dramatic shifts in the cost of inlining a function.
158   DenseMap<Value *, Constant *> SimplifiedValues;
159 
160   /// Keep track of the values which map back (through function arguments) to
161   /// allocas on the caller stack which could be simplified through SROA.
162   DenseMap<Value *, Value *> SROAArgValues;
163 
164   /// The mapping of caller Alloca values to their accumulated cost savings. If
165   /// we have to disable SROA for one of the allocas, this tells us how much
166   /// cost must be added.
167   DenseMap<Value *, int> SROAArgCosts;
168 
169   /// Keep track of values which map to a pointer base and constant offset.
170   DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
171 
172   /// Keep track of dead blocks due to the constant arguments.
173   SetVector<BasicBlock *> DeadBlocks;
174 
175   /// The mapping of the blocks to their known unique successors due to the
176   /// constant arguments.
177   DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
178 
179   /// Model the elimination of repeated loads that is expected to happen
180   /// whenever we simplify away the stores that would otherwise force those
181   /// repeated loads to remain.
182   bool EnableLoadElimination;
183   SmallPtrSet<Value *, 16> LoadAddrSet;
184   int LoadEliminationCost;
185 
186   // Custom simplification helper routines.
187   bool isAllocaDerivedArg(Value *V);
188   bool lookupSROAArgAndCost(Value *V, Value *&Arg,
189                             DenseMap<Value *, int>::iterator &CostIt);
190   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
191   void disableSROA(Value *V);
192   void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
193   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
194                           int InstructionCost);
195   void disableLoadElimination();
196   bool isGEPFree(GetElementPtrInst &GEP);
197   bool canFoldInboundsGEP(GetElementPtrInst &I);
198   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
199   bool simplifyCallSite(Function *F, CallSite CS);
200   template <typename Callable>
201   bool simplifyInstruction(Instruction &I, Callable Evaluate);
202   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
203 
204   /// Return true if the given argument to the function being considered for
205   /// inlining has the given attribute set either at the call site or the
206   /// function declaration.  Primarily used to inspect call site specific
207   /// attributes since these can be more precise than the ones on the callee
208   /// itself.
209   bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
210 
211   /// Return true if the given value is known to be non-null within the callee
212   /// if inlined through this particular callsite.
213   bool isKnownNonNullInCallee(Value *V);
214 
215   /// Update Threshold based on callsite properties such as callee
216   /// attributes and callee hotness for PGO builds. The Callee is explicitly
217   /// passed to support analyzing indirect calls whose target is inferred by
218   /// analysis.
219   void updateThreshold(CallSite CS, Function &Callee);
220 
221   /// Return true if size growth is allowed when inlining the callee at CS.
222   bool allowSizeGrowth(CallSite CS);
223 
224   /// Return true if \p CS is a cold callsite.
225   bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
226 
227   /// Return a higher threshold if \p CS is a hot callsite.
228   Optional<int> getHotCallSiteThreshold(CallSite CS,
229                                         BlockFrequencyInfo *CallerBFI);
230 
231   // Custom analysis routines.
232   InlineResult analyzeBlock(BasicBlock *BB,
233                             SmallPtrSetImpl<const Value *> &EphValues);
234 
235   // Disable several entry points to the visitor so we don't accidentally use
236   // them by declaring but not defining them here.
237   void visit(Module *);
238   void visit(Module &);
239   void visit(Function *);
240   void visit(Function &);
241   void visit(BasicBlock *);
242   void visit(BasicBlock &);
243 
244   // Provide base case for our instruction visit.
245   bool visitInstruction(Instruction &I);
246 
247   // Our visit overrides.
248   bool visitAlloca(AllocaInst &I);
249   bool visitPHI(PHINode &I);
250   bool visitGetElementPtr(GetElementPtrInst &I);
251   bool visitBitCast(BitCastInst &I);
252   bool visitPtrToInt(PtrToIntInst &I);
253   bool visitIntToPtr(IntToPtrInst &I);
254   bool visitCastInst(CastInst &I);
255   bool visitUnaryInstruction(UnaryInstruction &I);
256   bool visitCmpInst(CmpInst &I);
257   bool visitSub(BinaryOperator &I);
258   bool visitBinaryOperator(BinaryOperator &I);
259   bool visitLoad(LoadInst &I);
260   bool visitStore(StoreInst &I);
261   bool visitExtractValue(ExtractValueInst &I);
262   bool visitInsertValue(InsertValueInst &I);
263   bool visitCallSite(CallSite CS);
264   bool visitReturnInst(ReturnInst &RI);
265   bool visitBranchInst(BranchInst &BI);
266   bool visitSelectInst(SelectInst &SI);
267   bool visitSwitchInst(SwitchInst &SI);
268   bool visitIndirectBrInst(IndirectBrInst &IBI);
269   bool visitResumeInst(ResumeInst &RI);
270   bool visitCleanupReturnInst(CleanupReturnInst &RI);
271   bool visitCatchReturnInst(CatchReturnInst &RI);
272   bool visitUnreachableInst(UnreachableInst &I);
273 
274 public:
275   CallAnalyzer(const TargetTransformInfo &TTI,
276                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
277                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
278                ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
279                Function &Callee, CallSite CSArg, const InlineParams &Params)
280       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
281         PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
282         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
283         Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
284                                        Params.ComputeFullInlineCost || ORE),
285         IsCallerRecursive(false), IsRecursiveCall(false),
286         ExposesReturnsTwice(false), HasDynamicAlloca(false),
287         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
288         HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0),
289         NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
290         SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
291         NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
292         NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
293         NumInstructionsSimplified(0), SROACostSavings(0),
294         SROACostSavingsLost(0) {}
295 
296   InlineResult analyzeCall(CallSite CS);
297 
298   int getThreshold() { return Threshold; }
299   int getCost() { return Cost; }
300 
301   // Keep a bunch of stats about the cost savings found so we can print them
302   // out when debugging.
303   unsigned NumConstantArgs;
304   unsigned NumConstantOffsetPtrArgs;
305   unsigned NumAllocaArgs;
306   unsigned NumConstantPtrCmps;
307   unsigned NumConstantPtrDiffs;
308   unsigned NumInstructionsSimplified;
309   unsigned SROACostSavings;
310   unsigned SROACostSavingsLost;
311 
312   void dump();
313 };
314 
315 } // namespace
316 
317 /// Test whether the given value is an Alloca-derived function argument.
318 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
319   return SROAArgValues.count(V);
320 }
321 
322 /// Lookup the SROA-candidate argument and cost iterator which V maps to.
323 /// Returns false if V does not map to a SROA-candidate.
324 bool CallAnalyzer::lookupSROAArgAndCost(
325     Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
326   if (SROAArgValues.empty() || SROAArgCosts.empty())
327     return false;
328 
329   DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
330   if (ArgIt == SROAArgValues.end())
331     return false;
332 
333   Arg = ArgIt->second;
334   CostIt = SROAArgCosts.find(Arg);
335   return CostIt != SROAArgCosts.end();
336 }
337 
338 /// Disable SROA for the candidate marked by this cost iterator.
339 ///
340 /// This marks the candidate as no longer viable for SROA, and adds the cost
341 /// savings associated with it back into the inline cost measurement.
342 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
343   // If we're no longer able to perform SROA we need to undo its cost savings
344   // and prevent subsequent analysis.
345   Cost += CostIt->second;
346   SROACostSavings -= CostIt->second;
347   SROACostSavingsLost += CostIt->second;
348   SROAArgCosts.erase(CostIt);
349   disableLoadElimination();
350 }
351 
352 /// If 'V' maps to a SROA candidate, disable SROA for it.
353 void CallAnalyzer::disableSROA(Value *V) {
354   Value *SROAArg;
355   DenseMap<Value *, int>::iterator CostIt;
356   if (lookupSROAArgAndCost(V, SROAArg, CostIt))
357     disableSROA(CostIt);
358 }
359 
360 /// Accumulate the given cost for a particular SROA candidate.
361 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
362                                       int InstructionCost) {
363   CostIt->second += InstructionCost;
364   SROACostSavings += InstructionCost;
365 }
366 
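/// Stop crediting eliminated loads: add any savings already credited back into
/// Cost and stop accumulating further savings.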
367 void CallAnalyzer::disableLoadElimination() {
368   if (EnableLoadElimination) {
369     Cost += LoadEliminationCost;
370     LoadEliminationCost = 0;
371     EnableLoadElimination = false;
372   }
373 }
374 
375 /// Accumulate a constant GEP offset into an APInt if possible.
376 ///
377 /// Returns false if unable to compute the offset for any reason. Respects any
378 /// simplified values known during the analysis of this callsite.
379 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
380   unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
381   assert(IntPtrWidth == Offset.getBitWidth());
382 
383   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
384        GTI != GTE; ++GTI) {
385     ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
386     if (!OpC)
387       if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
388         OpC = dyn_cast<ConstantInt>(SimpleOp);
389     if (!OpC)
390       return false;
391     if (OpC->isZero())
392       continue;
393 
394     // Handle a struct index, which adds its field offset to the pointer.
395     if (StructType *STy = GTI.getStructTypeOrNull()) {
396       unsigned ElementIdx = OpC->getZExtValue();
397       const StructLayout *SL = DL.getStructLayout(STy);
398       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
399       continue;
400     }
401 
402     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
403     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
404   }
405   return true;
406 }
407 
408 /// Use TTI to check whether a GEP is free.
409 ///
410 /// Respects any simplified values known during the analysis of this callsite.
411 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
412   SmallVector<Value *, 4> Operands;
413   Operands.push_back(GEP.getOperand(0));
414   for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
415     if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
416        Operands.push_back(SimpleOp);
417      else
418        Operands.push_back(*I);
419   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
420 }
421 
422 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
423   // Check whether inlining will turn a dynamic alloca into a static
424   // alloca and handle that case.
425   if (I.isArrayAllocation()) {
426     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
427     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
428       Type *Ty = I.getAllocatedType();
429       AllocatedSize = SaturatingMultiplyAdd(
430           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
431       return Base::visitAlloca(I);
432     }
433   }
434 
435   // Accumulate the allocated size.
436   if (I.isStaticAlloca()) {
437     Type *Ty = I.getAllocatedType();
438     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
439   }
440 
441   // We will happily inline static alloca instructions.
442   if (I.isStaticAlloca())
443     return Base::visitAlloca(I);
444 
445   // FIXME: This is overly conservative. Dynamic allocas are inefficient for
446   // a variety of reasons, and so we would like to not inline them into
447   // functions which don't currently have a dynamic alloca. This simply
448   // disables inlining altogether in the presence of a dynamic alloca.
449   HasDynamicAlloca = true;
450   return false;
451 }
452 
453 bool CallAnalyzer::visitPHI(PHINode &I) {
454   // FIXME: We need to propagate SROA *disabling* through phi nodes, even
455   // though we don't want to propagate its bonuses. The idea is to disable
456   // SROA if it *might* be used in an inappropriate manner.
457 
458   // Phi nodes are always zero-cost.
459   // FIXME: Pointer sizes may differ between different address spaces, so do we
460   // need to use the correct address space in the call to getPointerSizeInBits here?
461   // Or could we skip the getPointerSizeInBits call completely? As far as I can
462   // see the ZeroOffset is used as a dummy value, so we can probably use any
463   // bit width for the ZeroOffset?
464   APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
465   bool CheckSROA = I.getType()->isPointerTy();
466 
467   // Track the constant or pointer with constant offset we've seen so far.
468   Constant *FirstC = nullptr;
469   std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
470   Value *FirstV = nullptr;
471 
472   for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
473     BasicBlock *Pred = I.getIncomingBlock(i);
474     // If the incoming block is dead, skip the incoming block.
475     if (DeadBlocks.count(Pred))
476       continue;
477     // If the parent block of phi is not the known successor of the incoming
478     // block, skip the incoming block.
479     BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
480     if (KnownSuccessor && KnownSuccessor != I.getParent())
481       continue;
482 
483     Value *V = I.getIncomingValue(i);
484     // If the incoming value is this phi itself, skip the incoming value.
485     if (&I == V)
486       continue;
487 
488     Constant *C = dyn_cast<Constant>(V);
489     if (!C)
490       C = SimplifiedValues.lookup(V);
491 
492     std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
493     if (!C && CheckSROA)
494       BaseAndOffset = ConstantOffsetPtrs.lookup(V);
495 
496     if (!C && !BaseAndOffset.first)
497       // The incoming value is neither a constant nor a pointer with constant
498       // offset, exit early.
499       return true;
500 
501     if (FirstC) {
502       if (FirstC == C)
503         // If we've seen a constant incoming value before and it is the same
504         // constant we see this time, continue checking the next incoming value.
505         continue;
506       // Otherwise early exit because we either see a different constant or saw
507       // a constant before but we have a pointer with constant offset this time.
508       return true;
509     }
510 
511     if (FirstV) {
512       // The same logic as above, but check pointer with constant offset here.
513       if (FirstBaseAndOffset == BaseAndOffset)
514         continue;
515       return true;
516     }
517 
518     if (C) {
519       // This is the 1st time we've seen a constant, record it.
520       FirstC = C;
521       continue;
522     }
523 
524     // The remaining case is that this is the 1st time we've seen a pointer with
525     // constant offset, record it.
526     FirstV = V;
527     FirstBaseAndOffset = BaseAndOffset;
528   }
529 
530   // Check if we can map phi to a constant.
531   if (FirstC) {
532     SimplifiedValues[&I] = FirstC;
533     return true;
534   }
535 
536   // Check if we can map phi to a pointer with constant offset.
537   if (FirstBaseAndOffset.first) {
538     ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
539 
540     Value *SROAArg;
541     DenseMap<Value *, int>::iterator CostIt;
542     if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
543       SROAArgValues[&I] = SROAArg;
544   }
545 
546   return true;
547 }
548 
549 /// Check we can fold GEPs of constant-offset call site argument pointers.
550 /// This requires target data and inbounds GEPs.
551 ///
552 /// \return true if the specified GEP can be folded.
553 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
554   // Check if we have a base + offset for the pointer.
555   std::pair<Value *, APInt> BaseAndOffset =
556       ConstantOffsetPtrs.lookup(I.getPointerOperand());
557   if (!BaseAndOffset.first)
558     return false;
559 
560   // Check if the offset of this GEP is constant, and if so accumulate it
561   // into Offset.
562   if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
563     return false;
564 
565   // Add the result as a new mapping to Base + Offset.
566   ConstantOffsetPtrs[&I] = BaseAndOffset;
567 
568   return true;
569 }
570 
571 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
572   Value *SROAArg;
573   DenseMap<Value *, int>::iterator CostIt;
574   bool SROACandidate =
575       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
576 
577   // Lambda to check whether a GEP's indices are all constant.
578   auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
579     for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
580       if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
581         return false;
582     return true;
583   };
584 
585   if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
586     if (SROACandidate)
587       SROAArgValues[&I] = SROAArg;
588 
589     // Constant GEPs are modeled as free.
590     return true;
591   }
592 
593   // Variable GEPs will require math and will disable SROA.
594   if (SROACandidate)
595     disableSROA(CostIt);
596   return isGEPFree(I);
597 }
598 
599 /// Simplify \p I if its operands are constants and update SimplifiedValues.
600 /// \p Evaluate is a callable specific to instruction type that evaluates the
601 /// instruction when all the operands are constants.
602 template <typename Callable>
603 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
604   SmallVector<Constant *, 2> COps;
605   for (Value *Op : I.operands()) {
606     Constant *COp = dyn_cast<Constant>(Op);
607     if (!COp)
608       COp = SimplifiedValues.lookup(Op);
609     if (!COp)
610       return false;
611     COps.push_back(COp);
612   }
613   auto *C = Evaluate(COps);
614   if (!C)
615     return false;
616   SimplifiedValues[&I] = C;
617   return true;
618 }
619 
620 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
621   // Propagate constants through bitcasts.
622   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
623         return ConstantExpr::getBitCast(COps[0], I.getType());
624       }))
625     return true;
626 
627   // Track base/offsets through casts
628   std::pair<Value *, APInt> BaseAndOffset =
629       ConstantOffsetPtrs.lookup(I.getOperand(0));
630   // Casts don't change the offset, just wrap it up.
631   if (BaseAndOffset.first)
632     ConstantOffsetPtrs[&I] = BaseAndOffset;
633 
634   // Also look for SROA candidates here.
635   Value *SROAArg;
636   DenseMap<Value *, int>::iterator CostIt;
637   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
638     SROAArgValues[&I] = SROAArg;
639 
640   // Bitcasts are always zero cost.
641   return true;
642 }
643 
644 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
645   // Propagate constants through ptrtoint.
646   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
647         return ConstantExpr::getPtrToInt(COps[0], I.getType());
648       }))
649     return true;
650 
651   // Track base/offset pairs when converted to a plain integer provided the
652   // integer is large enough to represent the pointer.
653   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
654   unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
655   if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
656     std::pair<Value *, APInt> BaseAndOffset =
657         ConstantOffsetPtrs.lookup(I.getOperand(0));
658     if (BaseAndOffset.first)
659       ConstantOffsetPtrs[&I] = BaseAndOffset;
660   }
661 
662   // This is really weird. Technically, ptrtoint will disable SROA. However,
663   // unless that ptrtoint is *used* somewhere in the live basic blocks after
664   // inlining, it will be nuked, and SROA should proceed. All of the uses which
665   // would block SROA would also block SROA if applied directly to a pointer,
666   // and so we can just add the integer in here. The only places where SROA is
667   // preserved either cannot fire on an integer, or won't in-and-of themselves
668   // disable SROA (ext) w/o some later use that we would see and disable.
669   Value *SROAArg;
670   DenseMap<Value *, int>::iterator CostIt;
671   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
672     SROAArgValues[&I] = SROAArg;
673 
674   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
675 }
676 
677 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
678   // Propagate constants through inttoptr.
679   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
680         return ConstantExpr::getIntToPtr(COps[0], I.getType());
681       }))
682     return true;
683 
684   // Track base/offset pairs when round-tripped through a pointer without
685   // modifications provided the integer is not too large.
686   Value *Op = I.getOperand(0);
687   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
688   if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
689     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
690     if (BaseAndOffset.first)
691       ConstantOffsetPtrs[&I] = BaseAndOffset;
692   }
693 
694   // "Propagate" SROA here in the same manner as we do for ptrtoint above.
695   Value *SROAArg;
696   DenseMap<Value *, int>::iterator CostIt;
697   if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
698     SROAArgValues[&I] = SROAArg;
699 
700   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
701 }
702 
703 bool CallAnalyzer::visitCastInst(CastInst &I) {
704   // Propagate constants through casts.
705   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
706         return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
707       }))
708     return true;
709 
710   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
711   disableSROA(I.getOperand(0));
712 
713   // If this is a floating-point cast, and the target says this operation
714   // is expensive, this may eventually become a library call. Treat the cost
715   // as such.
716   switch (I.getOpcode()) {
717   case Instruction::FPTrunc:
718   case Instruction::FPExt:
719   case Instruction::UIToFP:
720   case Instruction::SIToFP:
721   case Instruction::FPToUI:
722   case Instruction::FPToSI:
723     if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
724       Cost += InlineConstants::CallPenalty;
725     break;
726   default:
727     break;
728   }
729 
730   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
731 }
732 
733 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
734   Value *Operand = I.getOperand(0);
735   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
736         return ConstantFoldInstOperands(&I, COps[0], DL);
737       }))
738     return true;
739 
740   // Disable any SROA on the argument to arbitrary unary operators.
741   disableSROA(Operand);
742 
743   return false;
744 }
745 
746 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
747   return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
748 }
749 
750 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
751   // Does the *call site* have the NonNull attribute set on an argument?  We
752   // use the attribute on the call site to memoize any analysis done in the
753   // caller. This will also trip if the callee function has a non-null
754   // parameter attribute, but that's a less interesting case because hopefully
755   // the callee would already have been simplified based on that.
756   if (Argument *A = dyn_cast<Argument>(V))
757     if (paramHasAttr(A, Attribute::NonNull))
758       return true;
759 
760   // Is this an alloca in the caller?  This is distinct from the attribute case
761   // above because attributes aren't updated within the inliner itself and we
762   // always want to catch the alloca derived case.
763   if (isAllocaDerivedArg(V))
764     // We can actually predict the result of comparisons between an
765     // alloca-derived value and null. Note that this fires regardless of
766     // SROA firing.
767     return true;
768 
769   return false;
770 }
771 
772 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
773   // If the normal destination of the invoke or the parent block of the call
774   // site is unreachable-terminated, there is little point in inlining this
775   // unless there is literally zero cost.
776   // FIXME: Note that it is possible that an unreachable-terminated block has a
777   // hot entry. For example, in below scenario inlining hot_call_X() may be
778   // beneficial :
779   // main() {
780   //   hot_call_1();
781   //   ...
782   //   hot_call_N()
783   //   exit(0);
784   // }
785   // For now, we are not handling this corner case here as it is rare in real
786   // code. In future, we should elaborate this based on BPI and BFI in more
787   // general threshold adjusting heuristics in updateThreshold().
788   Instruction *Instr = CS.getInstruction();
789   if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
790     if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
791       return false;
792   } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
793     return false;
794 
795   return true;
796 }
797 
798 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
799   // If global profile summary is available, then callsite's coldness is
800   // determined based on that.
801   if (PSI && PSI->hasProfileSummary())
802     return PSI->isColdCallSite(CS, CallerBFI);
803 
804   // Otherwise we need BFI to be available.
805   if (!CallerBFI)
806     return false;
807 
808   // Determine if the callsite is cold relative to caller's entry. We could
809   // potentially cache the computation of scaled entry frequency, but the added
810   // complexity is not worth it unless this scaling shows up high in the
811   // profiles.
812   const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
813   auto CallSiteBB = CS.getInstruction()->getParent();
814   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
815   auto CallerEntryFreq =
816       CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
817   return CallSiteFreq < CallerEntryFreq * ColdProb;
818 }
819 
820 Optional<int>
821 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
822                                       BlockFrequencyInfo *CallerBFI) {
823 
824   // If global profile summary is available, then callsite's hotness is
825   // determined based on that.
826   if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
827     return Params.HotCallSiteThreshold;
828 
829   // Otherwise we need BFI to be available and to have a locally hot callsite
830   // threshold.
831   if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
832     return None;
833 
834   // Determine if the callsite is hot relative to caller's entry. We could
835   // potentially cache the computation of scaled entry frequency, but the added
836   // complexity is not worth it unless this scaling shows up high in the
837   // profiles.
838   auto CallSiteBB = CS.getInstruction()->getParent();
839   auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
840   auto CallerEntryFreq = CallerBFI->getEntryFreq();
841   if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
842     return Params.LocallyHotCallSiteThreshold;
843 
844   // Otherwise treat it normally.
845   return None;
846 }
847 
848 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
849   // If no size growth is allowed for this inlining, set Threshold to 0.
850   if (!allowSizeGrowth(CS)) {
851     Threshold = 0;
852     return;
853   }
854 
855   Function *Caller = CS.getCaller();
856 
857   // return min(A, B) if B is valid.
858   auto MinIfValid = [](int A, Optional<int> B) {
859     return B ? std::min(A, B.getValue()) : A;
860   };
861 
862   // return max(A, B) if B is valid.
863   auto MaxIfValid = [](int A, Optional<int> B) {
864     return B ? std::max(A, B.getValue()) : A;
865   };
866 
867   // Various bonus percentages. These are multiplied by Threshold to get the
868   // bonus values.
869   // SingleBBBonus: This bonus is applied if the callee has a single reachable
870   // basic block at the given callsite context. This is speculatively applied
871   // and withdrawn if more than one basic block is seen.
872   //
873   // Vector bonuses: We want to more aggressively inline vector-dense kernels
874   // and apply this bonus based on the percentage of vector instructions. A
875   // bonus is applied if vector instructions exceed 50% of all instructions, and
876   // half that bonus is applied if they exceed 10%. Note that these bonuses are
877   // somewhat arbitrary and evolved over time by accident as much as because
878   // they are principled bonuses.
879   // FIXME: It would be nice to base the bonus values on something more
880   // scientific.
881   //
882   // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
883   // of the last call to a static function as inlining such functions is
884   // guaranteed to reduce code size.
885   //
886   // These bonus percentages may be set to 0 based on properties of the caller
887   // and the callsite.
888   int SingleBBBonusPercent = 50;
889   int VectorBonusPercent = 150;
890   int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
891 
892   // Lambda to set all the above bonus and bonus percentages to 0.
893   auto DisallowAllBonuses = [&]() {
894     SingleBBBonusPercent = 0;
895     VectorBonusPercent = 0;
896     LastCallToStaticBonus = 0;
897   };
898 
899   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
900   // and reduce the threshold if the caller has the necessary attribute.
901   if (Caller->optForMinSize()) {
902     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
903     // For minsize, we want to disable the single BB bonus and the vector
904     // bonuses, but not the last-call-to-static bonus. Inlining the last call to
905     // a static function will, at the minimum, eliminate the parameter setup and
906     // call/return instructions.
907     SingleBBBonusPercent = 0;
908     VectorBonusPercent = 0;
909   } else if (Caller->optForSize())
910     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
911 
912   // Adjust the threshold based on inlinehint attribute and profile based
913   // hotness information if the caller does not have MinSize attribute.
914   if (!Caller->optForMinSize()) {
915     if (Callee.hasFnAttribute(Attribute::InlineHint))
916       Threshold = MaxIfValid(Threshold, Params.HintThreshold);
917 
918     // FIXME: After switching to the new passmanager, simplify the logic below
919     // by checking only the callsite hotness/coldness as we will reliably
920     // have local profile information.
921     //
922     // Callsite hotness and coldness can be determined if sample profile is
923     // used (which adds hotness metadata to calls) or if caller's
924     // BlockFrequencyInfo is available.
925     BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
926     auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
927     if (!Caller->optForSize() && HotCallSiteThreshold) {
928       LLVM_DEBUG(dbgs() << "Hot callsite.\n");
929       // FIXME: This should update the threshold only if it exceeds the
930       // current threshold, but AutoFDO + ThinLTO currently relies on this
931       // behavior to prevent inlining of hot callsites during ThinLTO
932       // compile phase.
933       Threshold = HotCallSiteThreshold.getValue();
934     } else if (isColdCallSite(CS, CallerBFI)) {
935       LLVM_DEBUG(dbgs() << "Cold callsite.\n");
936       // Do not apply bonuses for a cold callsite including the
937       // LastCallToStatic bonus. While this bonus might result in code size
938       // reduction, it can cause the size of a non-cold caller to increase
939       // preventing it from being inlined.
940       DisallowAllBonuses();
941       Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
942     } else if (PSI) {
943       // Use callee's global profile information only if we have no way of
944       // determining this via callsite information.
945       if (PSI->isFunctionEntryHot(&Callee)) {
946         LLVM_DEBUG(dbgs() << "Hot callee.\n");
947         // If callsite hotness can not be determined, we may still know
948         // that the callee is hot and treat it as a weaker hint for threshold
949         // increase.
950         Threshold = MaxIfValid(Threshold, Params.HintThreshold);
951       } else if (PSI->isFunctionEntryCold(&Callee)) {
952         LLVM_DEBUG(dbgs() << "Cold callee.\n");
953         // Do not apply bonuses for a cold callee including the
954         // LastCallToStatic bonus. While this bonus might result in code size
955         // reduction, it can cause the size of a non-cold caller to increase
956         // preventing it from being inlined.
957         DisallowAllBonuses();
958         Threshold = MinIfValid(Threshold, Params.ColdThreshold);
959       }
960     }
961   }
962 
963   // Finally, take the target-specific inlining threshold multiplier into
964   // account.
965   Threshold *= TTI.getInliningThresholdMultiplier();
966 
967   SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
968   VectorBonus = Threshold * VectorBonusPercent / 100;
969 
970   bool OnlyOneCallAndLocalLinkage =
971       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
972   // If there is only one call of the function, and it has internal linkage,
973   // the cost of inlining it drops dramatically. It may seem odd to update
974   // Cost in updateThreshold, but the bonus depends on the logic in this method.
975   if (OnlyOneCallAndLocalLinkage)
976     Cost -= LastCallToStaticBonus;
977 }
978 
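/// Fold comparisons whose operands simplify to constants, comparisons of
/// pointers sharing a constant-offset base, and null checks against values
/// known to be non-null in the callee; null checks of SROA-able allocas are
/// credited as SROA savings.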
979 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
980   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
981   // First try to handle simplified comparisons.
982   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
983         return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
984       }))
985     return true;
986 
987   if (I.getOpcode() == Instruction::FCmp)
988     return false;
989 
990   // Otherwise look for a comparison between constant offset pointers with
991   // a common base.
992   Value *LHSBase, *RHSBase;
993   APInt LHSOffset, RHSOffset;
994   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
995   if (LHSBase) {
996     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
997     if (RHSBase && LHSBase == RHSBase) {
998       // We have common bases, fold the icmp to a constant based on the
999       // offsets.
1000       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1001       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1002       if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1003         SimplifiedValues[&I] = C;
1004         ++NumConstantPtrCmps;
1005         return true;
1006       }
1007     }
1008   }
1009 
1010   // If the comparison is an equality comparison with null, we can simplify it
1011   // if we know the value (argument) can't be null.
1012   if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1013       isKnownNonNullInCallee(I.getOperand(0))) {
1014     bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1015     SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1016                                       : ConstantInt::getFalse(I.getType());
1017     return true;
1018   }
1019   // Finally check for SROA candidates in comparisons.
1020   Value *SROAArg;
1021   DenseMap<Value *, int>::iterator CostIt;
1022   if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1023     if (isa<ConstantPointerNull>(I.getOperand(1))) {
1024       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1025       return true;
1026     }
1027 
1028     disableSROA(CostIt);
1029   }
1030 
1031   return false;
1032 }
1033 
1034 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1035   // Try to handle a special case: we can fold computing the difference of two
1036   // constant-related pointers.
1037   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1038   Value *LHSBase, *RHSBase;
1039   APInt LHSOffset, RHSOffset;
1040   std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1041   if (LHSBase) {
1042     std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1043     if (RHSBase && LHSBase == RHSBase) {
1044       // We have common bases, fold the subtract to a constant based on the
1045       // offsets.
1046       Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1047       Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1048       if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1049         SimplifiedValues[&I] = C;
1050         ++NumConstantPtrDiffs;
1051         return true;
1052       }
1053     }
1054   }
1055 
1056   // Otherwise, fall back to the generic logic for simplifying and handling
1057   // instructions.
1058   return Base::visitSub(I);
1059 }
1060 
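/// Fold binary operators whose operands are (or simplify to) constants;
/// otherwise disable SROA on both operands and charge a call penalty for
/// floating-point operations the target considers expensive.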
1061 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1062   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1063   Constant *CLHS = dyn_cast<Constant>(LHS);
1064   if (!CLHS)
1065     CLHS = SimplifiedValues.lookup(LHS);
1066   Constant *CRHS = dyn_cast<Constant>(RHS);
1067   if (!CRHS)
1068     CRHS = SimplifiedValues.lookup(RHS);
1069 
1070   Value *SimpleV = nullptr;
1071   if (auto FI = dyn_cast<FPMathOperator>(&I))
1072     SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1073                               CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1074   else
1075     SimpleV =
1076         SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1077 
1078   if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1079     SimplifiedValues[&I] = C;
1080 
1081   if (SimpleV)
1082     return true;
1083 
1084   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1085   disableSROA(LHS);
1086   disableSROA(RHS);
1087 
1088   // If the instruction is floating point, and the target says this operation
1089   // is expensive, this may eventually become a library call. Treat the cost
1090   // as such.
1091   if (I.getType()->isFloatingPointTy() &&
1092       TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1093     Cost += InlineConstants::CallPenalty;
1094 
1095   return false;
1096 }
1097 
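/// Credit loads from SROA-able allocas as SROA savings, and treat a repeated
/// load from an address we have already seen as eliminable while load
/// elimination is still enabled.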
1098 bool CallAnalyzer::visitLoad(LoadInst &I) {
1099   Value *SROAArg;
1100   DenseMap<Value *, int>::iterator CostIt;
1101   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1102     if (I.isSimple()) {
1103       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1104       return true;
1105     }
1106 
1107     disableSROA(CostIt);
1108   }
1109 
1110   // If the data is already loaded from this address and hasn't been clobbered
1111   // by any stores or calls, this load is likely to be redundant and can be
1112   // eliminated.
1113   if (EnableLoadElimination &&
1114       !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1115     LoadEliminationCost += InlineConstants::InstrCost;
1116     return true;
1117   }
1118 
1119   return false;
1120 }
1121 
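/// Credit stores to SROA-able allocas as SROA savings; any other store
/// conservatively disables further load-elimination credit.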
1122 bool CallAnalyzer::visitStore(StoreInst &I) {
1123   Value *SROAArg;
1124   DenseMap<Value *, int>::iterator CostIt;
1125   if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1126     if (I.isSimple()) {
1127       accumulateSROACost(CostIt, InlineConstants::InstrCost);
1128       return true;
1129     }
1130 
1131     disableSROA(CostIt);
1132   }
1133 
1134   // The store can potentially clobber loads and prevent repeated loads from
1135   // being eliminated.
1136   // FIXME:
1137   // 1. We can probably keep an initial set of eliminable loads subtracted
1138   // from the cost even when we finally see a store. We just need to disable
1139   // *further* accumulation of elimination savings.
1140   // 2. We should probably at some point thread MemorySSA for the callee into
1141   // this and then use that to actually compute *really* precise savings.
1142   disableLoadElimination();
1143   return false;
1144 }
1145 
1146 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1147   // Constant folding for extract value is trivial.
1148   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1149         return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1150       }))
1151     return true;
1152 
1153   // SROA can look through these but give them a cost.
1154   return false;
1155 }
1156 
1157 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1158   // Constant folding for insert value is trivial.
1159   if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1160         return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1161                                             /*InsertedValueOperand*/ COps[1],
1162                                             I.getIndices());
1163       }))
1164     return true;
1165 
1166   // SROA can look through these but give them a cost.
1167   return false;
1168 }
1169 
1170 /// Try to simplify a call site.
1171 ///
1172 /// Takes a concrete function and callsite and tries to actually simplify it by
1173 /// analyzing the arguments and call itself with instsimplify. Returns true if
1174 /// it has simplified the callsite to some other entity (a constant), making it
1175 /// free.
1176 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1177   // FIXME: Using the instsimplify logic directly for this is inefficient
1178   // because we have to continually rebuild the argument list even when no
1179   // simplifications can be performed. Until that is fixed with remapping
1180   // inside of instsimplify, directly constant fold calls here.
1181   if (!canConstantFoldCallTo(CS, F))
1182     return false;
1183 
1184   // Try to re-map the arguments to constants.
1185   SmallVector<Constant *, 4> ConstantArgs;
1186   ConstantArgs.reserve(CS.arg_size());
1187   for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1188        ++I) {
1189     Constant *C = dyn_cast<Constant>(*I);
1190     if (!C)
1191       C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1192     if (!C)
1193       return false; // This argument doesn't map to a constant.
1194 
1195     ConstantArgs.push_back(C);
1196   }
1197   if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1198     SimplifiedValues[CS.getInstruction()] = C;
1199     return true;
1200   }
1201 
1202   return false;
1203 }
1204 
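/// Model the cost of a call site: constant-fold calls to known functions where
/// possible, special-case a few intrinsics, and, when an indirect call's target
/// is known through simplification, pretend to inline that target to size a
/// devirtualization bonus.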
1205 bool CallAnalyzer::visitCallSite(CallSite CS) {
1206   if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1207       !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1208     // This aborts the entire analysis.
1209     ExposesReturnsTwice = true;
1210     return false;
1211   }
1212   if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1213     ContainsNoDuplicateCall = true;
1214 
1215   if (Function *F = CS.getCalledFunction()) {
1216     // When we have a concrete function, first try to simplify it directly.
1217     if (simplifyCallSite(F, CS))
1218       return true;
1219 
1220     // Next check if it is an intrinsic we know about.
1221     // FIXME: Lift this into part of the InstVisitor.
1222     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1223       switch (II->getIntrinsicID()) {
1224       default:
1225         if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1226           disableLoadElimination();
1227         return Base::visitCallSite(CS);
1228 
1229       case Intrinsic::load_relative:
1230         // This is normally lowered to 4 LLVM instructions.
1231         Cost += 3 * InlineConstants::InstrCost;
1232         return false;
1233 
1234       case Intrinsic::memset:
1235       case Intrinsic::memcpy:
1236       case Intrinsic::memmove:
1237         disableLoadElimination();
1238         // SROA can usually chew through these intrinsics, but they aren't free.
1239         return false;
1240       case Intrinsic::icall_branch_funnel:
1241       case Intrinsic::localescape:
1242         HasUninlineableIntrinsic = true;
1243         return false;
1244       case Intrinsic::vastart:
1245         InitsVargArgs = true;
1246         return false;
1247       }
1248     }
1249 
1250     if (F == CS.getInstruction()->getFunction()) {
1251       // This flag will fully abort the analysis, so don't bother with anything
1252       // else.
1253       IsRecursiveCall = true;
1254       return false;
1255     }
1256 
1257     if (TTI.isLoweredToCall(F)) {
1258       // We account for the average 1 instruction per call argument setup
1259       // here.
1260       Cost += CS.arg_size() * InlineConstants::InstrCost;
1261 
1262       // Everything other than inline ASM will also have a significant cost
1263       // merely from making the call.
1264       if (!isa<InlineAsm>(CS.getCalledValue()))
1265         Cost += InlineConstants::CallPenalty;
1266     }
1267 
1268     if (!CS.onlyReadsMemory())
1269       disableLoadElimination();
1270     return Base::visitCallSite(CS);
1271   }
1272 
1273   // Otherwise we're in a very special case -- an indirect function call. See
1274   // if we can be particularly clever about this.
1275   Value *Callee = CS.getCalledValue();
1276 
1277   // First, pay the price of the argument setup. We account for the average
1278   // 1 instruction per call argument setup here.
1279   Cost += CS.arg_size() * InlineConstants::InstrCost;
1280 
1281   // Next, check if this happens to be an indirect function call to a known
1282   // function in this inline context. If not, we've done all we can.
1283   Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1284   if (!F) {
1285     if (!CS.onlyReadsMemory())
1286       disableLoadElimination();
1287     return Base::visitCallSite(CS);
1288   }
1289 
1290   // If we have a constant that we are calling as a function, we can peer
1291   // through it and see the function target. This happens not infrequently
1292   // during devirtualization and so we want to give it a hefty bonus for
1293   // inlining, but cap that bonus in the event that inlining wouldn't pan
1294   // out. Pretend to inline the function, with a custom threshold.
1295   auto IndirectCallParams = Params;
1296   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1297   CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1298                   IndirectCallParams);
1299   if (CA.analyzeCall(CS)) {
1300     // We were able to inline the indirect call! Subtract the cost from the
1301     // threshold to get the bonus we want to apply, but don't go below zero.
1302     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1303   }
1304 
1305   if (!F->onlyReadsMemory())
1306     disableLoadElimination();
1307   return Base::visitCallSite(CS);
1308 }
1309 
1310 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1311   // At least one return instruction will be free after inlining.
1312   bool Free = !HasReturn;
1313   HasReturn = true;
1314   return Free;
1315 }
1316 
1317 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1318   // We model unconditional branches as essentially free -- they really
1319   // shouldn't exist at all, but handling them makes the behavior of the
1320   // inliner more regular and predictable. Interestingly, conditional branches
1321   // which will fold away are also free.
1322   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1323          dyn_cast_or_null<ConstantInt>(
1324              SimplifiedValues.lookup(BI.getCondition()));
1325 }
1326 
1327 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1328   bool CheckSROA = SI.getType()->isPointerTy();
1329   Value *TrueVal = SI.getTrueValue();
1330   Value *FalseVal = SI.getFalseValue();
1331 
1332   Constant *TrueC = dyn_cast<Constant>(TrueVal);
1333   if (!TrueC)
1334     TrueC = SimplifiedValues.lookup(TrueVal);
1335   Constant *FalseC = dyn_cast<Constant>(FalseVal);
1336   if (!FalseC)
1337     FalseC = SimplifiedValues.lookup(FalseVal);
1338   Constant *CondC =
1339       dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1340 
1341   if (!CondC) {
1342     // Select C, X, X => X
1343     if (TrueC == FalseC && TrueC) {
1344       SimplifiedValues[&SI] = TrueC;
1345       return true;
1346     }
1347 
1348     if (!CheckSROA)
1349       return Base::visitSelectInst(SI);
1350 
1351     std::pair<Value *, APInt> TrueBaseAndOffset =
1352         ConstantOffsetPtrs.lookup(TrueVal);
1353     std::pair<Value *, APInt> FalseBaseAndOffset =
1354         ConstantOffsetPtrs.lookup(FalseVal);
1355     if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1356       ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1357 
1358       Value *SROAArg;
1359       DenseMap<Value *, int>::iterator CostIt;
1360       if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1361         SROAArgValues[&SI] = SROAArg;
1362       return true;
1363     }
1364 
1365     return Base::visitSelectInst(SI);
1366   }
1367 
1368   // Select condition is a constant.
1369   Value *SelectedV = CondC->isAllOnesValue()
1370                          ? TrueVal
1371                          : (CondC->isNullValue()) ? FalseVal : nullptr;
1372   if (!SelectedV) {
1373     // Condition is a vector constant that is not all 1s or all 0s. If all
1374     // operands are constants, ConstantExpr::getSelect() can handle cases such
1375     // as vector selects.
1376     if (TrueC && FalseC) {
1377       if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1378         SimplifiedValues[&SI] = C;
1379         return true;
1380       }
1381     }
1382     return Base::visitSelectInst(SI);
1383   }
1384 
1385   // Condition is either all 1s or all 0s. SI can be simplified.
1386   if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1387     SimplifiedValues[&SI] = SelectedC;
1388     return true;
1389   }
1390 
1391   if (!CheckSROA)
1392     return true;
1393 
1394   std::pair<Value *, APInt> BaseAndOffset =
1395       ConstantOffsetPtrs.lookup(SelectedV);
1396   if (BaseAndOffset.first) {
1397     ConstantOffsetPtrs[&SI] = BaseAndOffset;
1398 
1399     Value *SROAArg;
1400     DenseMap<Value *, int>::iterator CostIt;
1401     if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1402       SROAArgValues[&SI] = SROAArg;
1403   }
1404 
1405   return true;
1406 }
1407 
1408 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1409   // We model unconditional switches as free, see the comments on handling
1410   // branches.
1411   if (isa<ConstantInt>(SI.getCondition()))
1412     return true;
1413   if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1414     if (isa<ConstantInt>(V))
1415       return true;
1416 
1417   // Assume the most general case: the switch is lowered into either a jump
1418   // table, a bit test, or a balanced binary tree of case clusters, without
1419   // merging adjacent clusters that share a destination. We do not model
1420   // switches that are lowered with a mix of jump table, bit test, and binary
1421   // search tree. The cost of the switch is proportional to the size of the
1422   // tree or to the size of the jump table range.
1423   //
1424   // NB: Large switches that are only used to initialize large phi nodes are
1425   // converted to lookup tables in simplify-cfg, so this shouldn't prevent
1426   // inlining those. It will still prevent inlining in cases where that
1427   // optimization does not (yet) fire.
1428 
1429   // Maximum cost to which Cost may be increased in this function.
1430   int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1431 
1432   // Exit early for a large switch, assuming one case needs at least one
1433   // instruction.
1434   // FIXME: This is not true for a bit test, but ignore that case for now to
1435   // save compile time.
1436   int64_t CostLowerBound =
1437       std::min((int64_t)CostUpperBound,
1438                (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
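  // E.g. (illustrative only): a switch with 1000 cases contributes at least
  // 1000 * InlineConstants::InstrCost here, which by itself is likely to push
  // Cost past a typical threshold and trigger the early exit below.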
1439 
1440   if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1441     Cost = CostLowerBound;
1442     return false;
1443   }
1444 
1445   unsigned JumpTableSize = 0;
1446   unsigned NumCaseCluster =
1447       TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1448 
1449   // If suitable for a jump table, consider the cost for the table size and
1450   // branch to destination.
1451   if (JumpTableSize) {
1452     int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1453                      4 * InlineConstants::InstrCost;
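    // E.g. (illustrative only): a jump table spanning 16 entries is charged
    // (16 + 4) * InlineConstants::InstrCost on top of the running Cost.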
1454 
1455     Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1456     return false;
1457   }
1458 
1459   // If the switch is instead lowered to a binary search tree, we want the
1460   // number of nodes in the tree, which equals the number of comparisons once
1461   // lowered. For a given number of clusters, n, we can define a recursive
1462   // function, f(n), that counts the nodes. The recursion is:
1463   //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
1464   //   f(n) = n,                       when n <= 3.
1465   // This yields a binary tree whose leaves are f(2) or f(3) subtrees when
1466   // n > 3. The leaves therefore contribute n comparisons, while the number
1467   // of non-leaf nodes is:
1468   //   2^(log2(n) - 1) - 1
1469   //   = 2^log2(n) * 2^-1 - 1
1470   //   = n / 2 - 1.
1471   // Summing the comparisons from leaf and non-leaf nodes gives the simple
1472   // closed-form estimate:
1473   //   n + n / 2 - 1 = n * 3 / 2 - 1.
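  // Worked example (illustrative only): for NumCaseCluster = 8 the closed form
  // gives 3 * 8 / 2 - 1 = 11 expected comparisons, each charged below as one
  // compare plus one conditional branch, i.e. 22 * InlineConstants::InstrCost.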
1474   if (NumCaseCluster <= 3) {
1475     // Suppose a comparison includes one compare and one conditional branch.
1476     Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1477     return false;
1478   }
1479 
1480   int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1481   int64_t SwitchCost =
1482       ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1483 
1484   Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1485   return false;
1486 }
1487 
1488 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1489   // We never want to inline functions that contain an indirectbr. Inlining
1490   // such a function would be incorrect because all the blockaddresses (in
1491   // static global initializers, for example) would still refer to the
1492   // original function, so the indirect jump would jump from the inlined copy
1493   // of the function into the original function, which is undefined behavior.
1494   // FIXME: This logic isn't really right; we can safely inline functions with
1495   // indirectbr's as long as no other function or global references the
1496   // blockaddress of a block within the current function.
1497   HasIndirectBr = true;
1498   return false;
1499 }
1500 
1501 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1502   // FIXME: It's not clear that a single instruction is an accurate model for
1503   // the inline cost of a resume instruction.
1504   return false;
1505 }
1506 
1507 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1508   // FIXME: It's not clear that a single instruction is an accurate model for
1509   // the inline cost of a cleanupret instruction.
1510   return false;
1511 }
1512 
1513 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1514   // FIXME: It's not clear that a single instruction is an accurate model for
1515   // the inline cost of a catchret instruction.
1516   return false;
1517 }
1518 
1519 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1520   // FIXME: It might be reasonable to discount the cost of instructions leading
1521   // to unreachable as they have the lowest possible impact on both runtime and
1522   // code size.
1523   return true; // No actual code is needed for unreachable.
1524 }
1525 
1526 bool CallAnalyzer::visitInstruction(Instruction &I) {
1527   // Some instructions are free. All of the free intrinsics can also be
1528   // handled by SROA, etc.
1529   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1530     return true;
1531 
1532   // We found something we don't understand or can't handle. Mark any SROA-able
1533   // values in the operand list as no longer viable.
1534   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1535     disableSROA(*OI);
1536 
1537   return false;
1538 }
1539 
1540 /// Analyze a basic block for its contribution to the inline cost.
1541 ///
1542 /// This method walks the analyzer over every instruction in the given basic
1543 /// block and accounts for their cost during inlining at this callsite. It
1544 /// aborts early if the threshold has been exceeded or an impossible to inline
1545 /// construct has been detected. It returns false if inlining is no longer
1546 /// viable, and true if inlining remains viable.
1547 InlineResult
1548 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1549                            SmallPtrSetImpl<const Value *> &EphValues) {
1550   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1551     // FIXME: Currently, the number of instructions in a function, regardless
1552     // of our ability to simplify them to constants or dead code during
1553     // inlining, is used by the vector bonus heuristic. As long as that's true,
1554     // we have to special-case debug intrinsics here to prevent differences in
1555     // inlining due to debug symbols. Eventually, the number of unsimplified
1556     // instructions shouldn't factor into the cost computation, but until then,
1557     // hack around it here.
1558     if (isa<DbgInfoIntrinsic>(I))
1559       continue;
1560 
1561     // Skip ephemeral values.
1562     if (EphValues.count(&*I))
1563       continue;
1564 
1565     ++NumInstructions;
1566     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1567       ++NumVectorInstructions;
1568 
1569     // If the instruction simplified to a constant, there is no cost to this
1570     // instruction. Visit the instructions using our InstVisitor to account for
1571     // all of the per-instruction logic. The visit tree returns true if we
1572     // consumed the instruction in any way, and false if the instruction's base
1573     // cost should count against inlining.
1574     if (Base::visit(&*I))
1575       ++NumInstructionsSimplified;
1576     else
1577       Cost += InlineConstants::InstrCost;
1578 
1579     using namespace ore;
1580     // If visiting this instruction detected an uninlinable pattern, abort.
1581     InlineResult IR;
1582     if (IsRecursiveCall)
1583       IR = "recursive";
1584     else if (ExposesReturnsTwice)
1585       IR = "exposes returns twice";
1586     else if (HasDynamicAlloca)
1587       IR = "dynamic alloca";
1588     else if (HasIndirectBr)
1589       IR = "indirect branch";
1590     else if (HasUninlineableIntrinsic)
1591       IR = "uninlinable intrinsic";
1592     else if (InitsVargArgs)
1593       IR = "varargs";
1594     if (!IR) {
1595       if (ORE)
1596         ORE->emit([&]() {
1597           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1598                                           CandidateCS.getInstruction())
1599                  << NV("Callee", &F) << " has uninlinable pattern ("
1600                  << NV("InlineResult", IR.message)
1601                  << ") and cost is not fully computed";
1602         });
1603       return IR;
1604     }
1605 
1606     // If the caller is a recursive function then we don't want to inline
1607     // functions which allocate a lot of stack space because it would increase
1608     // the caller stack usage dramatically.
1609     if (IsCallerRecursive &&
1610         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1611       InlineResult IR = "recursive and allocates too much stack space";
1612       if (ORE)
1613         ORE->emit([&]() {
1614           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1615                                           CandidateCS.getInstruction())
1616                  << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1617                  << ". Cost is not fully computed";
1618         });
1619       return IR;
1620     }
1621 
1622     // Check if we've passed the maximum possible threshold so we don't spin in
1623     // huge basic blocks that will never inline.
1624     if (Cost >= Threshold && !ComputeFullInlineCost)
1625       return false;
1626   }
1627 
1628   return true;
1629 }
1630 
1631 /// Compute the base pointer and cumulative constant offsets for V.
1632 ///
1633 /// This strips all constant offsets off of V, leaving it the base pointer, and
1634 /// accumulates the total constant offset applied in the returned constant. It
1635 /// returns null if V is not a pointer, and returns the constant '0' if there are
1636 /// no constant offsets applied.
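/// For example (a sketch of the intended behavior, not a specification): for
/// V = getelementptr inbounds i8, i8* %base, i64 12, V is rewritten to %base
/// and the constant 12 is returned.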
1637 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1638   if (!V->getType()->isPointerTy())
1639     return nullptr;
1640 
1641   unsigned AS = V->getType()->getPointerAddressSpace();
1642   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1643   APInt Offset = APInt::getNullValue(IntPtrWidth);
1644 
1645   // Even though we don't look through PHI nodes, we could be called on an
1646   // instruction in an unreachable block, which may be on a cycle.
1647   SmallPtrSet<Value *, 4> Visited;
1648   Visited.insert(V);
1649   do {
1650     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1651       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1652         return nullptr;
1653       V = GEP->getPointerOperand();
1654     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1655       V = cast<Operator>(V)->getOperand(0);
1656     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1657       if (GA->isInterposable())
1658         break;
1659       V = GA->getAliasee();
1660     } else {
1661       break;
1662     }
1663     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1664   } while (Visited.insert(V).second);
1665 
1666   Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1667   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1668 }
1669 
1670 /// Find dead blocks due to deleted CFG edges during inlining.
1671 ///
1672 /// If we know the successor of the current block, \p CurrBB, has to be \p
1673 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1674 /// no live incoming CFG edges.  If one block is found to be dead, we can
1675 /// continue growing the dead block list by checking the successors of the dead
1676 /// blocks to see if all their incoming edges are dead or not.
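/// As an illustrative example: if a conditional branch in \p CurrBB is known
/// to always take the edge to \p NextBB, the other successor becomes dead
/// provided it has no other live incoming edges, and its successors are then
/// examined in turn.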
1677 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1678   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1679     // A CFG edge is dead if the predecessor is dead or the predecessor has a
1680     // known successor which is not the one under examination.
1681     return (DeadBlocks.count(Pred) ||
1682             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1683   };
1684 
1685   auto IsNewlyDead = [&](BasicBlock *BB) {
1686     // If all the edges to a block are dead, the block is also dead.
1687     return (!DeadBlocks.count(BB) &&
1688             llvm::all_of(predecessors(BB),
1689                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1690   };
1691 
1692   for (BasicBlock *Succ : successors(CurrBB)) {
1693     if (Succ == NextBB || !IsNewlyDead(Succ))
1694       continue;
1695     SmallVector<BasicBlock *, 4> NewDead;
1696     NewDead.push_back(Succ);
1697     while (!NewDead.empty()) {
1698       BasicBlock *Dead = NewDead.pop_back_val();
1699       if (DeadBlocks.insert(Dead))
1700         // Continue growing the dead block list.
1701         for (BasicBlock *S : successors(Dead))
1702           if (IsNewlyDead(S))
1703             NewDead.push_back(S);
1704     }
1705   }
1706 }
1707 
1708 /// Analyze a call site for potential inlining.
1709 ///
1710 /// Returns true if inlining this call is viable, and false if it is not
1711 /// viable. It computes the cost and adjusts the threshold based on numerous
1712 /// factors and heuristics. If this method returns false but the computed cost
1713 /// is below the computed threshold, then inlining was forcibly disabled by
1714 /// some artifact of the routine.
1715 InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
1716   ++NumCallsAnalyzed;
1717 
1718   // Perform some tweaks to the cost and threshold based on the direct
1719   // callsite information.
1720 
1721   // We want to more aggressively inline vector-dense kernels, so up the
1722   // threshold, and we'll lower it if the % of vector instructions gets too
1723   // low. Note that these bonuses are somewhat arbitrary and evolved over time
1724   // by accident as much as because they are principled bonuses.
1725   //
1726   // FIXME: It would be nice to remove all such bonuses. At least it would be
1727   // nice to base the bonus values on something more scientific.
1728   assert(NumInstructions == 0);
1729   assert(NumVectorInstructions == 0);
1730 
1731   // Update the threshold based on callsite properties
1732   updateThreshold(CS, F);
1733 
1734   // While Threshold depends on commandline options that can take negative
1735   // values, we want to enforce the invariant that the computed threshold and
1736   // bonuses are non-negative.
1737   assert(Threshold >= 0);
1738   assert(SingleBBBonus >= 0);
1739   assert(VectorBonus >= 0);
1740 
1741   // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1742   // this Threshold any time, and cost cannot decrease, we can stop processing
1743   // the rest of the function body.
1744   Threshold += (SingleBBBonus + VectorBonus);
1745 
1746   // Give out bonuses for the callsite, as the instructions setting them up
1747   // will be gone after inlining.
1748   Cost -= getCallsiteCost(CS, DL);
1749 
1750   // If this function uses the coldcc calling convention, prefer not to inline
1751   // it.
1752   if (F.getCallingConv() == CallingConv::Cold)
1753     Cost += InlineConstants::ColdccPenalty;
1754 
1755   // Check if we're done. This can happen due to bonuses and penalties.
1756   if (Cost >= Threshold && !ComputeFullInlineCost)
1757     return "high cost";
1758 
1759   if (F.empty())
1760     return true;
1761 
1762   Function *Caller = CS.getInstruction()->getFunction();
1763   // Check if the caller function is recursive itself.
1764   for (User *U : Caller->users()) {
1765     CallSite Site(U);
1766     if (!Site)
1767       continue;
1768     Instruction *I = Site.getInstruction();
1769     if (I->getFunction() == Caller) {
1770       IsCallerRecursive = true;
1771       break;
1772     }
1773   }
1774 
1775   // Populate our simplified values by mapping from function arguments to call
1776   // arguments with known important simplifications.
1777   CallSite::arg_iterator CAI = CS.arg_begin();
1778   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1779        FAI != FAE; ++FAI, ++CAI) {
1780     assert(CAI != CS.arg_end());
1781     if (Constant *C = dyn_cast<Constant>(CAI))
1782       SimplifiedValues[&*FAI] = C;
1783 
1784     Value *PtrArg = *CAI;
1785     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1786       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1787 
1788       // We can SROA any pointer arguments derived from alloca instructions.
1789       if (isa<AllocaInst>(PtrArg)) {
1790         SROAArgValues[&*FAI] = PtrArg;
1791         SROAArgCosts[PtrArg] = 0;
1792       }
1793     }
1794   }
1795   NumConstantArgs = SimplifiedValues.size();
1796   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1797   NumAllocaArgs = SROAArgValues.size();
1798 
1799   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1800   // the ephemeral values multiple times (and they're completely determined by
1801   // the callee, so this is purely duplicate work).
1802   SmallPtrSet<const Value *, 32> EphValues;
1803   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1804 
1805   // The worklist of live basic blocks in the callee *after* inlining. We avoid
1806   // adding basic blocks of the callee which can be proven to be dead for this
1807   // particular call site in order to get more accurate cost estimates. This
1808   // requires a somewhat heavyweight iteration pattern: we need to walk the
1809   // basic blocks in a breadth-first order as we insert live successors. To
1810   // accomplish this we use a small-size-optimized SetVector, since we expect
1811   // the worklist to stay small because we exit after crossing our threshold.
1812   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1813                     SmallPtrSet<BasicBlock *, 16>>
1814       BBSetVector;
1815   BBSetVector BBWorklist;
1816   BBWorklist.insert(&F.getEntryBlock());
1817   bool SingleBB = true;
1818   // Note that we *must not* cache the size; this loop grows the worklist.
1819   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1820     // Bail out the moment we cross the threshold. This means we'll under-count
1821     // the cost, but only when undercounting doesn't matter.
1822     if (Cost >= Threshold && !ComputeFullInlineCost)
1823       break;
1824 
1825     BasicBlock *BB = BBWorklist[Idx];
1826     if (BB->empty())
1827       continue;
1828 
1829     // Disallow inlining a blockaddress. A blockaddress only has defined
1830     // behavior for an indirect branch in the same function, and we do not
1831     // currently support inlining indirect branches. But, the inliner may not
1832     // see an indirect branch that ends up being dead code at a particular call
1833     // site. If the blockaddress escapes the function, e.g., via a global
1834     // variable, inlining may lead to an invalid cross-function reference.
1835     if (BB->hasAddressTaken())
1836       return "blockaddress";
1837 
1838     // Analyze the cost of this block. If we blow through the threshold, this
1839     // returns false, and we can bail out.
1840     InlineResult IR = analyzeBlock(BB, EphValues);
1841     if (!IR)
1842       return IR;
1843 
1844     Instruction *TI = BB->getTerminator();
1845 
1846     // Add in the live successors by first checking whether we have a terminator
1847     // that may be simplified based on the values simplified by this call.
1848     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1849       if (BI->isConditional()) {
1850         Value *Cond = BI->getCondition();
1851         if (ConstantInt *SimpleCond =
1852                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1853           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1854           BBWorklist.insert(NextBB);
1855           KnownSuccessors[BB] = NextBB;
1856           findDeadBlocks(BB, NextBB);
1857           continue;
1858         }
1859       }
1860     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1861       Value *Cond = SI->getCondition();
1862       if (ConstantInt *SimpleCond =
1863               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1864         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1865         BBWorklist.insert(NextBB);
1866         KnownSuccessors[BB] = NextBB;
1867         findDeadBlocks(BB, NextBB);
1868         continue;
1869       }
1870     }
1871 
1872     // If we're unable to select a particular successor, just count all of
1873     // them.
1874     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1875          ++TIdx)
1876       BBWorklist.insert(TI->getSuccessor(TIdx));
1877 
1878     // If we had any successors at this point, then post-inlining is likely to
1879     // have them as well. Note that we assume any basic blocks which existed
1880     // due to branches or switches which folded above will also fold after
1881     // inlining.
1882     if (SingleBB && TI->getNumSuccessors() > 1) {
1883       // Take off the bonus we applied to the threshold.
1884       Threshold -= SingleBBBonus;
1885       SingleBB = false;
1886     }
1887   }
1888 
1889   bool OnlyOneCallAndLocalLinkage =
1890       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1891   // If this is a noduplicate call, we can still inline as long as
1892   // inlining this would cause the removal of the caller (so the instruction
1893   // is not actually duplicated, just moved).
1894   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1895     return "noduplicate";
1896 
1897   // Loops generally act a lot like calls in that they act like barriers to
1898   // movement, require a certain amount of setup, etc. So when optimizing for
1899   // size, we penalize any call sites whose callees contain loops. We do this
1900   // after all other costs, so we will likely only be dealing with relatively
1901   // small functions (and hence DT and LI will hopefully be cheap).
1902   if (Caller->optForMinSize()) {
1903     DominatorTree DT(F);
1904     LoopInfo LI(DT);
1905     int NumLoops = 0;
1906     for (Loop *L : LI) {
1907       // Ignore loops that will not be executed
1908       if (DeadBlocks.count(L->getHeader()))
1909         continue;
1910       NumLoops++;
1911     }
1912     Cost += NumLoops * InlineConstants::CallPenalty;
1913   }
1914 
1915   // We applied the maximum possible vector bonus at the beginning. Now,
1916   // subtract the excess bonus, if any, from the Threshold before
1917   // comparing against Cost.
1918   if (NumVectorInstructions <= NumInstructions / 10)
1919     Threshold -= VectorBonus;
1920   else if (NumVectorInstructions <= NumInstructions / 2)
1921     Threshold -= VectorBonus/2;
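  // E.g. (illustrative only): a callee with 100 instructions keeps the full
  // bonus if more than 50 of them are vector instructions, half the bonus for
  // 11-50, and none of it for 10 or fewer.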
1922 
1923   return Cost < std::max(1, Threshold);
1924 }
1925 
1926 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1927 /// Dump stats about this call's analysis.
1928 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1929 #define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
1930   DEBUG_PRINT_STAT(NumConstantArgs);
1931   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1932   DEBUG_PRINT_STAT(NumAllocaArgs);
1933   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1934   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1935   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1936   DEBUG_PRINT_STAT(NumInstructions);
1937   DEBUG_PRINT_STAT(SROACostSavings);
1938   DEBUG_PRINT_STAT(SROACostSavingsLost);
1939   DEBUG_PRINT_STAT(LoadEliminationCost);
1940   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1941   DEBUG_PRINT_STAT(Cost);
1942   DEBUG_PRINT_STAT(Threshold);
1943 #undef DEBUG_PRINT_STAT
1944 }
1945 #endif
1946 
1947 /// Test that there are no attribute conflicts between Caller and Callee
1948 /// that prevent inlining.
1949 static bool functionsHaveCompatibleAttributes(Function *Caller,
1950                                               Function *Callee,
1951                                               TargetTransformInfo &TTI) {
1952   return TTI.areInlineCompatible(Caller, Callee) &&
1953          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1954 }
1955 
1956 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1957   int Cost = 0;
1958   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1959     if (CS.isByValArgument(I)) {
1960       // We approximate the number of loads and stores needed by dividing the
1961       // size of the byval type by the target's pointer size.
1962       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1963       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1964       unsigned AS = PTy->getAddressSpace();
1965       unsigned PointerSize = DL.getPointerSizeInBits(AS);
1966       // Ceiling division.
1967       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
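      // E.g. (illustrative only): a 196-bit byval type copied with 64-bit
      // pointers needs ceil(196 / 64) = 4 words, charged below as 4 loads and
      // 4 stores, i.e. 8 * InlineConstants::InstrCost.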
1968 
1969       // If it generates more than 8 stores it is likely to be expanded as an
1970       // inline memcpy so we take that as an upper bound. Otherwise we assume
1971       // one load and one store per word copied.
1972       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1973       // here instead of a magic number of 8, but it's not available via
1974       // DataLayout.
1975       NumStores = std::min(NumStores, 8U);
1976 
1977       Cost += 2 * NumStores * InlineConstants::InstrCost;
1978     } else {
1979       // For non-byval arguments subtract off one instruction per call
1980       // argument.
1981       Cost += InlineConstants::InstrCost;
1982     }
1983   }
1984   // The call instruction also disappears after inlining.
1985   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
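  // E.g. (illustrative only): a call with two non-byval arguments totals
  // 2 * InstrCost for the argument setup plus InstrCost + CallPenalty for the
  // call instruction itself.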
1986   return Cost;
1987 }
1988 
1989 InlineCost llvm::getInlineCost(
1990     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1991     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1992     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1993     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1994   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1995                        GetAssumptionCache, GetBFI, PSI, ORE);
1996 }
1997 
1998 InlineCost llvm::getInlineCost(
1999     CallSite CS, Function *Callee, const InlineParams &Params,
2000     TargetTransformInfo &CalleeTTI,
2001     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2002     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2003     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2004 
2005   // Cannot inline indirect calls.
2006   if (!Callee)
2007     return llvm::InlineCost::getNever("indirect call");
2008 
2009   // Never inline calls with byval arguments that do not have the alloca
2010   // address space. Since byval arguments can be replaced with a copy to an
2011   // alloca, the inlined code would need to be adjusted to handle the argument
2012   // being in the alloca address space (so it is a little bit complicated to
2013   // solve).
2014   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2015   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
2016     if (CS.isByValArgument(I)) {
2017       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
2018       if (PTy->getAddressSpace() != AllocaAS)
2019         return llvm::InlineCost::getNever("byval arguments without alloca"
2020                                           " address space");
2021     }
2022 
2023   // Calls to functions with always-inline attributes should be inlined
2024   // whenever possible.
2025   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
2026     if (isInlineViable(*Callee))
2027       return llvm::InlineCost::getAlways("always inline attribute");
2028     return llvm::InlineCost::getNever("inapplicable always inline attribute");
2029   }
2030 
2031   // Never inline functions with conflicting attributes (unless callee has
2032   // always-inline attribute).
2033   Function *Caller = CS.getCaller();
2034   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
2035     return llvm::InlineCost::getNever("conflicting attributes");
2036 
2037   // Don't inline this call if the caller has the optnone attribute.
2038   if (Caller->hasFnAttribute(Attribute::OptimizeNone))
2039     return llvm::InlineCost::getNever("optnone attribute");
2040 
2041   // Don't inline a function that treats null pointer as valid into a caller
2042   // that does not have this attribute.
2043   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2044     return llvm::InlineCost::getNever("nullptr definitions incompatible");
2045 
2046   // Don't inline functions which can be interposed at link-time.
2047   if (Callee->isInterposable())
2048     return llvm::InlineCost::getNever("interposable");
2049 
2050   // Don't inline functions marked noinline.
2051   if (Callee->hasFnAttribute(Attribute::NoInline))
2052     return llvm::InlineCost::getNever("noinline function attribute");
2053 
2054   // Don't inline call sites marked noinline.
2055   if (CS.isNoInline())
2056     return llvm::InlineCost::getNever("noinline call site attribute");
2057 
2058   LLVM_DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
2059                           << "... (caller:" << Caller->getName() << ")\n");
2060 
2061   CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2062                   Params);
2063   InlineResult ShouldInline = CA.analyzeCall(CS);
2064 
2065   LLVM_DEBUG(CA.dump());
2066 
2067   // Check if there was a reason to force inlining or no inlining.
2068   if (!ShouldInline && CA.getCost() < CA.getThreshold())
2069     return InlineCost::getNever(ShouldInline.message);
2070   if (ShouldInline && CA.getCost() >= CA.getThreshold())
2071     return InlineCost::getAlways("empty function");
2072 
2073   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2074 }
2075 
2076 bool llvm::isInlineViable(Function &F) {
2077   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2078   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2079     // Disallow inlining of functions which contain indirect branches or
2080     // blockaddresses.
2081     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2082       return false;
2083 
2084     for (auto &II : *BI) {
2085       CallSite CS(&II);
2086       if (!CS)
2087         continue;
2088 
2089       // Disallow recursive calls.
2090       if (&F == CS.getCalledFunction())
2091         return false;
2092 
2093       // Disallow calls which expose returns-twice to a function not previously
2094       // attributed as such.
2095       if (!ReturnsTwice && CS.isCall() &&
2096           cast<CallInst>(CS.getInstruction())->canReturnTwice())
2097         return false;
2098 
2099       if (CS.getCalledFunction())
2100         switch (CS.getCalledFunction()->getIntrinsicID()) {
2101         default:
2102           break;
2103         // Disallow inlining of @llvm.icall.branch.funnel because the current
2104         // backend can't separate call targets from call arguments.
2105         case llvm::Intrinsic::icall_branch_funnel:
2106         // Disallow inlining functions that call @llvm.localescape. Doing this
2107         // correctly would require major changes to the inliner.
2108         case llvm::Intrinsic::localescape:
2109         // Disallow inlining of functions that initialize VarArgs with va_start.
2110         case llvm::Intrinsic::vastart:
2111           return false;
2112         }
2113     }
2114   }
2115 
2116   return true;
2117 }
2118 
2119 // APIs to create InlineParams based on command line flags and/or other
2120 // parameters.
2121 
2122 InlineParams llvm::getInlineParams(int Threshold) {
2123   InlineParams Params;
2124 
2125   // This field is the threshold to use for a callee by default. This is
2126   // derived from one or more of:
2127   //  * optimization or size-optimization levels,
2128   //  * a value passed to createFunctionInliningPass function, or
2129   //  * the -inline-threshold flag.
2130   //  If the -inline-threshold flag is explicitly specified, that is used
2131   //  irrespective of anything else.
2132   if (InlineThreshold.getNumOccurrences() > 0)
2133     Params.DefaultThreshold = InlineThreshold;
2134   else
2135     Params.DefaultThreshold = Threshold;
2136 
2137   // Set the HintThreshold knob from the -inlinehint-threshold.
2138   Params.HintThreshold = HintThreshold;
2139 
2140   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2141   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2142 
2143   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2144   // populate LocallyHotCallSiteThreshold. Later, we populate
2145   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2146   // we know that optimization level is O3 (in the getInlineParams variant that
2147   // takes the opt and size levels).
2148   // FIXME: Remove this check (and make the assignment unconditional) after
2149   // addressing size regression issues at O2.
2150   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2151     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2152 
2153   // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2154   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2155 
2156   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2157   // -inline-threshold command-line option is not explicitly given. If that
2158   // option is present, then its value applies even for callees with size and
2159   // minsize attributes.
2160   // If -inline-threshold is not specified, set the ColdThreshold from the
2161   // -inlinecold-threshold even if it is not explicitly passed. If
2162   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2163   // explicitly specified to set the ColdThreshold knob.
2164   if (InlineThreshold.getNumOccurrences() == 0) {
2165     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2166     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2167     Params.ColdThreshold = ColdThreshold;
2168   } else if (ColdThreshold.getNumOccurrences() > 0) {
2169     Params.ColdThreshold = ColdThreshold;
2170   }
2171   return Params;
2172 }
2173 
2174 InlineParams llvm::getInlineParams() {
2175   return getInlineParams(InlineThreshold);
2176 }
2177 
2178 // Compute the default threshold for inlining based on the opt level and the
2179 // size opt level.
2180 static int computeThresholdFromOptLevels(unsigned OptLevel,
2181                                          unsigned SizeOptLevel) {
2182   if (OptLevel > 2)
2183     return InlineConstants::OptAggressiveThreshold;
2184   if (SizeOptLevel == 1) // -Os
2185     return InlineConstants::OptSizeThreshold;
2186   if (SizeOptLevel == 2) // -Oz
2187     return InlineConstants::OptMinSizeThreshold;
2188   return InlineThreshold;
2189 }
2190 
2191 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2192   auto Params =
2193       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2194   // At O3, use the value of -locally-hot-callsite-threshold option to populate
2195   // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2196   // when it is specified explicitly.
2197   if (OptLevel > 2)
2198     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2199   return Params;
2200 }
2201