1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements inline cost analysis.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/Analysis/InlineCost.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SetVector.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/Analysis/AssumptionCache.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/CodeMetrics.h"
23 #include "llvm/Analysis/ConstantFolding.h"
24 #include "llvm/Analysis/CFG.h"
25 #include "llvm/Analysis/InstructionSimplify.h"
26 #include "llvm/Analysis/LoopInfo.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Config/llvm-config.h"
31 #include "llvm/IR/CallSite.h"
32 #include "llvm/IR/CallingConv.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/GetElementPtrTypeIterator.h"
36 #include "llvm/IR/GlobalAlias.h"
37 #include "llvm/IR/InstVisitor.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Operator.h"
40 #include "llvm/Support/Debug.h"
41 #include "llvm/Support/raw_ostream.h"
42
43 using namespace llvm;
44
45 #define DEBUG_TYPE "inline-cost"
46
47 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
48
49 static cl::opt<int> InlineThreshold(
50 "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
51 cl::desc("Control the amount of inlining to perform (default = 225)"));
52
53 static cl::opt<int> HintThreshold(
54 "inlinehint-threshold", cl::Hidden, cl::init(325),
55 cl::desc("Threshold for inlining functions with inline hint"));
56
57 static cl::opt<int>
58 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
59 cl::init(45),
60 cl::desc("Threshold for inlining cold callsites"));
61
62 // We introduce this threshold to help performance of instrumentation-based
63 // PGO before we actually hook up the inliner with analysis passes such as
64 // BPI and BFI.
65 static cl::opt<int> ColdThreshold(
66 "inlinecold-threshold", cl::Hidden, cl::init(45),
67 cl::desc("Threshold for inlining functions with cold attribute"));
68
69 static cl::opt<int>
70 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
71 cl::ZeroOrMore,
72 cl::desc("Threshold for hot callsites "));
73
74 static cl::opt<int> LocallyHotCallSiteThreshold(
75 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
76 cl::desc("Threshold for locally hot callsites "));
77
78 static cl::opt<int> ColdCallSiteRelFreq(
79 "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
80 cl::desc("Maxmimum block frequency, expressed as a percentage of caller's "
81 "entry frequency, for a callsite to be cold in the absence of "
82 "profile information."));
83
84 static cl::opt<int> HotCallSiteRelFreq(
85 "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
86 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
87 "entry frequency, for a callsite to be hot in the absence of "
88 "profile information."));
89
90 static cl::opt<bool> OptComputeFullInlineCost(
91 "inline-cost-full", cl::Hidden, cl::init(false),
92 cl::desc("Compute the full inline cost of a call site even when the cost "
93 "exceeds the threshold."));
94
95 namespace {
96
97 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
98 typedef InstVisitor<CallAnalyzer, bool> Base;
99 friend class InstVisitor<CallAnalyzer, bool>;
100
101 /// The TargetTransformInfo available for this compilation.
102 const TargetTransformInfo &TTI;
103
104 /// Getter for the cache of @llvm.assume intrinsics.
105 std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
106
107 /// Getter for BlockFrequencyInfo
108 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
109
110 /// Profile summary information.
111 ProfileSummaryInfo *PSI;
112
113 /// The called function.
114 Function &F;
115
116 // Cache the DataLayout since we use it a lot.
117 const DataLayout &DL;
118
119 /// The OptimizationRemarkEmitter available for this compilation.
120 OptimizationRemarkEmitter *ORE;
121
122 /// The candidate callsite being analyzed. Please do not use this to do
123 /// analysis in the caller function; we want the inline cost query to be
124 /// easily cacheable. Instead, use the cover function paramHasAttr.
125 CallSite CandidateCS;
126
127 /// Tunable parameters that control the analysis.
128 const InlineParams &Params;
129
130 int Threshold;
131 int Cost;
132 bool ComputeFullInlineCost;
133
134 bool IsCallerRecursive;
135 bool IsRecursiveCall;
136 bool ExposesReturnsTwice;
137 bool HasDynamicAlloca;
138 bool ContainsNoDuplicateCall;
139 bool HasReturn;
140 bool HasIndirectBr;
141 bool HasUninlineableIntrinsic;
142 bool InitsVargArgs;
143
144 /// Number of bytes allocated statically by the callee.
145 uint64_t AllocatedSize;
146 unsigned NumInstructions, NumVectorInstructions;
147 int VectorBonus, TenPercentVectorBonus;
148 // Bonus to be applied when the callee has only one reachable basic block.
149 int SingleBBBonus;
150
151 /// While we walk the potentially-inlined instructions, we build up and
152 /// maintain a mapping of simplified values specific to this callsite. The
153 /// idea is to propagate any special information we have about arguments to
154 /// this call through the inlinable section of the function, and account for
155 /// likely simplifications post-inlining. The most important aspect we track
156 /// is CFG altering simplifications -- when we prove a basic block dead, that
157 /// can cause dramatic shifts in the cost of inlining a function.
158 DenseMap<Value *, Constant *> SimplifiedValues;
159
160 /// Keep track of the values which map back (through function arguments) to
161 /// allocas on the caller stack which could be simplified through SROA.
162 DenseMap<Value *, Value *> SROAArgValues;
163
164 /// The mapping of caller Alloca values to their accumulated cost savings. If
165 /// we have to disable SROA for one of the allocas, this tells us how much
166 /// cost must be added.
167 DenseMap<Value *, int> SROAArgCosts;
168
169 /// Keep track of values which map to a pointer base and constant offset.
170 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
171
172 /// Keep track of dead blocks due to the constant arguments.
173 SetVector<BasicBlock *> DeadBlocks;
174
175 /// The mapping of the blocks to their known unique successors due to the
176 /// constant arguments.
177 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
178
179 /// Model the elimination of repeated loads that is expected to happen
180 /// whenever we simplify away the stores that would otherwise clobber the
181 /// loaded addresses.
182 bool EnableLoadElimination;
183 SmallPtrSet<Value *, 16> LoadAddrSet;
184 int LoadEliminationCost;
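// For example, a second unordered load from an address already recorded in
// LoadAddrSet is modeled as free and its InstrCost is accumulated into
// LoadEliminationCost; if a clobbering store or call is seen later,
// disableLoadElimination() adds that accumulated amount back into Cost.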
185
186 // Custom simplification helper routines.
187 bool isAllocaDerivedArg(Value *V);
188 bool lookupSROAArgAndCost(Value *V, Value *&Arg,
189 DenseMap<Value *, int>::iterator &CostIt);
190 void disableSROA(DenseMap<Value *, int>::iterator CostIt);
191 void disableSROA(Value *V);
192 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
193 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
194 int InstructionCost);
195 void disableLoadElimination();
196 bool isGEPFree(GetElementPtrInst &GEP);
197 bool canFoldInboundsGEP(GetElementPtrInst &I);
198 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
199 bool simplifyCallSite(Function *F, CallSite CS);
200 template <typename Callable>
201 bool simplifyInstruction(Instruction &I, Callable Evaluate);
202 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
203
204 /// Return true if the given argument to the function being considered for
205 /// inlining has the given attribute set either at the call site or the
206 /// function declaration. Primarily used to inspect call site specific
207 /// attributes since these can be more precise than the ones on the callee
208 /// itself.
209 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
210
211 /// Return true if the given value is known non null within the callee if
212 /// inlined through this particular callsite.
213 bool isKnownNonNullInCallee(Value *V);
214
215 /// Update Threshold based on callsite properties such as callee
216 /// attributes and callee hotness for PGO builds. The Callee is explicitly
217 /// passed to support analyzing indirect calls whose target is inferred by
218 /// analysis.
219 void updateThreshold(CallSite CS, Function &Callee);
220
221 /// Return true if size growth is allowed when inlining the callee at CS.
222 bool allowSizeGrowth(CallSite CS);
223
224 /// Return true if \p CS is a cold callsite.
225 bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
226
227 /// Return a higher threshold if \p CS is a hot callsite.
228 Optional<int> getHotCallSiteThreshold(CallSite CS,
229 BlockFrequencyInfo *CallerBFI);
230
231 // Custom analysis routines.
232 InlineResult analyzeBlock(BasicBlock *BB,
233 SmallPtrSetImpl<const Value *> &EphValues);
234
235 // Disable several entry points to the visitor so we don't accidentally use
236 // them by declaring but not defining them here.
237 void visit(Module *);
238 void visit(Module &);
239 void visit(Function *);
240 void visit(Function &);
241 void visit(BasicBlock *);
242 void visit(BasicBlock &);
243
244 // Provide base case for our instruction visit.
245 bool visitInstruction(Instruction &I);
246
247 // Our visit overrides.
248 bool visitAlloca(AllocaInst &I);
249 bool visitPHI(PHINode &I);
250 bool visitGetElementPtr(GetElementPtrInst &I);
251 bool visitBitCast(BitCastInst &I);
252 bool visitPtrToInt(PtrToIntInst &I);
253 bool visitIntToPtr(IntToPtrInst &I);
254 bool visitCastInst(CastInst &I);
255 bool visitUnaryInstruction(UnaryInstruction &I);
256 bool visitCmpInst(CmpInst &I);
257 bool visitSub(BinaryOperator &I);
258 bool visitBinaryOperator(BinaryOperator &I);
259 bool visitLoad(LoadInst &I);
260 bool visitStore(StoreInst &I);
261 bool visitExtractValue(ExtractValueInst &I);
262 bool visitInsertValue(InsertValueInst &I);
263 bool visitCallSite(CallSite CS);
264 bool visitReturnInst(ReturnInst &RI);
265 bool visitBranchInst(BranchInst &BI);
266 bool visitSelectInst(SelectInst &SI);
267 bool visitSwitchInst(SwitchInst &SI);
268 bool visitIndirectBrInst(IndirectBrInst &IBI);
269 bool visitResumeInst(ResumeInst &RI);
270 bool visitCleanupReturnInst(CleanupReturnInst &RI);
271 bool visitCatchReturnInst(CatchReturnInst &RI);
272 bool visitUnreachableInst(UnreachableInst &I);
273
274 public:
275 CallAnalyzer(const TargetTransformInfo &TTI,
276 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
277 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
278 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
279 Function &Callee, CallSite CSArg, const InlineParams &Params)
280 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
281 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
282 CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
283 Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
284 Params.ComputeFullInlineCost || ORE),
285 IsCallerRecursive(false), IsRecursiveCall(false),
286 ExposesReturnsTwice(false), HasDynamicAlloca(false),
287 ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
288 HasUninlineableIntrinsic(false), InitsVargArgs(false), AllocatedSize(0),
289 NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
290 SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
291 NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
292 NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
293 NumInstructionsSimplified(0), SROACostSavings(0),
294 SROACostSavingsLost(0) {}
295
296 InlineResult analyzeCall(CallSite CS);
297
298 int getThreshold() { return Threshold; }
299 int getCost() { return Cost; }
300
301 // Keep a bunch of stats about the cost savings found so we can print them
302 // out when debugging.
303 unsigned NumConstantArgs;
304 unsigned NumConstantOffsetPtrArgs;
305 unsigned NumAllocaArgs;
306 unsigned NumConstantPtrCmps;
307 unsigned NumConstantPtrDiffs;
308 unsigned NumInstructionsSimplified;
309 unsigned SROACostSavings;
310 unsigned SROACostSavingsLost;
311
312 void dump();
313 };
314
315 } // namespace
316
317 /// Test whether the given value is an Alloca-derived function argument.
318 bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
319 return SROAArgValues.count(V);
320 }
321
322 /// Lookup the SROA-candidate argument and cost iterator which V maps to.
323 /// Returns false if V does not map to a SROA-candidate.
324 bool CallAnalyzer::lookupSROAArgAndCost(
325 Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
326 if (SROAArgValues.empty() || SROAArgCosts.empty())
327 return false;
328
329 DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
330 if (ArgIt == SROAArgValues.end())
331 return false;
332
333 Arg = ArgIt->second;
334 CostIt = SROAArgCosts.find(Arg);
335 return CostIt != SROAArgCosts.end();
336 }
337
338 /// Disable SROA for the candidate marked by this cost iterator.
339 ///
340 /// This marks the candidate as no longer viable for SROA, and adds the cost
341 /// savings associated with it back into the inline cost measurement.
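/// For example, a candidate that had accumulated 2 * InlineConstants::InstrCost
/// of expected savings has that amount added back into Cost and recorded in
/// SROACostSavingsLost once it is disabled.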
342 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
343 // If we're no longer able to perform SROA we need to undo its cost savings
344 // and prevent subsequent analysis.
345 Cost += CostIt->second;
346 SROACostSavings -= CostIt->second;
347 SROACostSavingsLost += CostIt->second;
348 SROAArgCosts.erase(CostIt);
349 disableLoadElimination();
350 }
351
352 /// If 'V' maps to a SROA candidate, disable SROA for it.
353 void CallAnalyzer::disableSROA(Value *V) {
354 Value *SROAArg;
355 DenseMap<Value *, int>::iterator CostIt;
356 if (lookupSROAArgAndCost(V, SROAArg, CostIt))
357 disableSROA(CostIt);
358 }
359
360 /// Accumulate the given cost for a particular SROA candidate.
361 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
362 int InstructionCost) {
363 CostIt->second += InstructionCost;
364 SROACostSavings += InstructionCost;
365 }
366
367 void CallAnalyzer::disableLoadElimination() {
368 if (EnableLoadElimination) {
369 Cost += LoadEliminationCost;
370 LoadEliminationCost = 0;
371 EnableLoadElimination = false;
372 }
373 }
374
375 /// Accumulate a constant GEP offset into an APInt if possible.
376 ///
377 /// Returns false if unable to compute the offset for any reason. Respects any
378 /// simplified values known during the analysis of this callsite.
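/// For example (assuming a typical data layout where i32 occupies 4 bytes), a
/// GEP into { i32, i32 } with constant indices 0 and 1 accumulates the struct
/// field offset of 4 bytes, while a constant index of 3 into an i32 array adds
/// 3 * 4 = 12 bytes to the running offset.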
379 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
380 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
381 assert(IntPtrWidth == Offset.getBitWidth());
382
383 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
384 GTI != GTE; ++GTI) {
385 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
386 if (!OpC)
387 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
388 OpC = dyn_cast<ConstantInt>(SimpleOp);
389 if (!OpC)
390 return false;
391 if (OpC->isZero())
392 continue;
393
394 // Handle a struct index, which adds its field offset to the pointer.
395 if (StructType *STy = GTI.getStructTypeOrNull()) {
396 unsigned ElementIdx = OpC->getZExtValue();
397 const StructLayout *SL = DL.getStructLayout(STy);
398 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
399 continue;
400 }
401
402 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
403 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
404 }
405 return true;
406 }
407
408 /// Use TTI to check whether a GEP is free.
409 ///
410 /// Respects any simplified values known during the analysis of this callsite.
411 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
412 SmallVector<Value *, 4> Operands;
413 Operands.push_back(GEP.getOperand(0));
414 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
415 if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
416 Operands.push_back(SimpleOp);
417 else
418 Operands.push_back(*I);
419 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
420 }
421
422 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
423 // Check whether inlining will turn a dynamic alloca into a static
424 // alloca and handle that case.
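// For example, "alloca i32, i32 %n" where %n has been simplified to the
// constant 8 contributes 8 * 4 = 32 bytes to AllocatedSize (assuming a 4-byte
// i32) and is then handled like any other alloca by the base visitor.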
425 if (I.isArrayAllocation()) {
426 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
427 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
428 Type *Ty = I.getAllocatedType();
429 AllocatedSize = SaturatingMultiplyAdd(
430 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
431 return Base::visitAlloca(I);
432 }
433 }
434
435 // Accumulate the allocated size.
436 if (I.isStaticAlloca()) {
437 Type *Ty = I.getAllocatedType();
438 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
439 }
440
441 // We will happily inline static alloca instructions.
442 if (I.isStaticAlloca())
443 return Base::visitAlloca(I);
444
445 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
446 // a variety of reasons, and so we would like to not inline them into
447 // functions which don't currently have a dynamic alloca. This simply
448 // disables inlining altogether in the presence of a dynamic alloca.
449 HasDynamicAlloca = true;
450 return false;
451 }
452
453 bool CallAnalyzer::visitPHI(PHINode &I) {
454 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
455 // though we don't want to propagate its bonuses. The idea is to disable
456 // SROA if it *might* be used in an inappropriate manner.
457
458 // Phi nodes are always zero-cost.
459 // FIXME: Pointer sizes may differ between different address spaces, so do we
460 // need to use the correct address space in the call to getPointerSizeInBits here?
461 // Or could we skip the getPointerSizeInBits call completely? As far as I can
462 // see the ZeroOffset is used as a dummy value, so we can probably use any
463 // bit width for the ZeroOffset?
464 APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
465 bool CheckSROA = I.getType()->isPointerTy();
466
467 // Track the constant or pointer with constant offset we've seen so far.
468 Constant *FirstC = nullptr;
469 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
470 Value *FirstV = nullptr;
471
472 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
473 BasicBlock *Pred = I.getIncomingBlock(i);
474 // If the incoming block is dead, skip the incoming block.
475 if (DeadBlocks.count(Pred))
476 continue;
477 // If the parent block of phi is not the known successor of the incoming
478 // block, skip the incoming block.
479 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
480 if (KnownSuccessor && KnownSuccessor != I.getParent())
481 continue;
482
483 Value *V = I.getIncomingValue(i);
484 // If the incoming value is this phi itself, skip the incoming value.
485 if (&I == V)
486 continue;
487
488 Constant *C = dyn_cast<Constant>(V);
489 if (!C)
490 C = SimplifiedValues.lookup(V);
491
492 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
493 if (!C && CheckSROA)
494 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
495
496 if (!C && !BaseAndOffset.first)
497 // The incoming value is neither a constant nor a pointer with constant
498 // offset, exit early.
499 return true;
500
501 if (FirstC) {
502 if (FirstC == C)
503 // If we've seen a constant incoming value before and it is the same
504 // constant we see this time, continue checking the next incoming value.
505 continue;
506 // Otherwise early exit because we either see a different constant or saw
507 // a constant before but we have a pointer with constant offset this time.
508 return true;
509 }
510
511 if (FirstV) {
512 // The same logic as above, but check pointer with constant offset here.
513 if (FirstBaseAndOffset == BaseAndOffset)
514 continue;
515 return true;
516 }
517
518 if (C) {
519 // This is the 1st time we've seen a constant, record it.
520 FirstC = C;
521 continue;
522 }
523
524 // The remaining case is that this is the 1st time we've seen a pointer with
525 // constant offset, record it.
526 FirstV = V;
527 FirstBaseAndOffset = BaseAndOffset;
528 }
529
530 // Check if we can map phi to a constant.
531 if (FirstC) {
532 SimplifiedValues[&I] = FirstC;
533 return true;
534 }
535
536 // Check if we can map phi to a pointer with constant offset.
537 if (FirstBaseAndOffset.first) {
538 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
539
540 Value *SROAArg;
541 DenseMap<Value *, int>::iterator CostIt;
542 if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
543 SROAArgValues[&I] = SROAArg;
544 }
545
546 return true;
547 }
548
549 /// Check we can fold GEPs of constant-offset call site argument pointers.
550 /// This requires target data and inbounds GEPs.
551 ///
552 /// \return true if the specified GEP can be folded.
553 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
554 // Check if we have a base + offset for the pointer.
555 std::pair<Value *, APInt> BaseAndOffset =
556 ConstantOffsetPtrs.lookup(I.getPointerOperand());
557 if (!BaseAndOffset.first)
558 return false;
559
560 // Check if the offset of this GEP is constant, and if so accumulate it
561 // into Offset.
562 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
563 return false;
564
565 // Add the result as a new mapping to Base + Offset.
566 ConstantOffsetPtrs[&I] = BaseAndOffset;
567
568 return true;
569 }
570
571 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
572 Value *SROAArg;
573 DenseMap<Value *, int>::iterator CostIt;
574 bool SROACandidate =
575 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
576
577 // Lambda to check whether a GEP's indices are all constant.
578 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
579 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
580 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
581 return false;
582 return true;
583 };
584
585 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
586 if (SROACandidate)
587 SROAArgValues[&I] = SROAArg;
588
589 // Constant GEPs are modeled as free.
590 return true;
591 }
592
593 // Variable GEPs will require math and will disable SROA.
594 if (SROACandidate)
595 disableSROA(CostIt);
596 return isGEPFree(I);
597 }
598
599 /// Simplify \p I if its operands are constants and update SimplifiedValues.
600 /// \p Evaluate is a callable specific to instruction type that evaluates the
601 /// instruction when all the operands are constants.
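/// For example, visitBitCast invokes this with a callable that returns
/// ConstantExpr::getBitCast(COps[0], I.getType()), so a bitcast whose operand
/// is a known constant is folded and treated as free.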
602 template <typename Callable>
603 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
604 SmallVector<Constant *, 2> COps;
605 for (Value *Op : I.operands()) {
606 Constant *COp = dyn_cast<Constant>(Op);
607 if (!COp)
608 COp = SimplifiedValues.lookup(Op);
609 if (!COp)
610 return false;
611 COps.push_back(COp);
612 }
613 auto *C = Evaluate(COps);
614 if (!C)
615 return false;
616 SimplifiedValues[&I] = C;
617 return true;
618 }
619
620 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
621 // Propagate constants through bitcasts.
622 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
623 return ConstantExpr::getBitCast(COps[0], I.getType());
624 }))
625 return true;
626
627 // Track base/offsets through casts
628 std::pair<Value *, APInt> BaseAndOffset =
629 ConstantOffsetPtrs.lookup(I.getOperand(0));
630 // Casts don't change the offset, just wrap it up.
631 if (BaseAndOffset.first)
632 ConstantOffsetPtrs[&I] = BaseAndOffset;
633
634 // Also look for SROA candidates here.
635 Value *SROAArg;
636 DenseMap<Value *, int>::iterator CostIt;
637 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
638 SROAArgValues[&I] = SROAArg;
639
640 // Bitcasts are always zero cost.
641 return true;
642 }
643
644 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
645 // Propagate constants through ptrtoint.
646 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
647 return ConstantExpr::getPtrToInt(COps[0], I.getType());
648 }))
649 return true;
650
651 // Track base/offset pairs when converted to a plain integer provided the
652 // integer is large enough to represent the pointer.
653 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
654 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
655 if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
656 std::pair<Value *, APInt> BaseAndOffset =
657 ConstantOffsetPtrs.lookup(I.getOperand(0));
658 if (BaseAndOffset.first)
659 ConstantOffsetPtrs[&I] = BaseAndOffset;
660 }
661
662 // This is really weird. Technically, ptrtoint will disable SROA. However,
663 // unless that ptrtoint is *used* somewhere in the live basic blocks after
664 // inlining, it will be nuked, and SROA should proceed. All of the uses which
665 // would block SROA would also block SROA if applied directly to a pointer,
666 // and so we can just add the integer in here. The only places where SROA is
667 // preserved either cannot fire on an integer, or won't in-and-of themselves
668 // disable SROA (ext) w/o some later use that we would see and disable.
669 Value *SROAArg;
670 DenseMap<Value *, int>::iterator CostIt;
671 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
672 SROAArgValues[&I] = SROAArg;
673
674 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
675 }
676
677 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
678 // Propagate constants through inttoptr.
679 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
680 return ConstantExpr::getIntToPtr(COps[0], I.getType());
681 }))
682 return true;
683
684 // Track base/offset pairs when round-tripped through a pointer without
685 // modifications provided the integer is not too large.
686 Value *Op = I.getOperand(0);
687 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
688 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
689 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
690 if (BaseAndOffset.first)
691 ConstantOffsetPtrs[&I] = BaseAndOffset;
692 }
693
694 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
695 Value *SROAArg;
696 DenseMap<Value *, int>::iterator CostIt;
697 if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
698 SROAArgValues[&I] = SROAArg;
699
700 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
701 }
702
703 bool CallAnalyzer::visitCastInst(CastInst &I) {
704 // Propagate constants through casts.
705 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
706 return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
707 }))
708 return true;
709
710 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
711 disableSROA(I.getOperand(0));
712
713 // If this is a floating-point cast, and the target says this operation
714 // is expensive, this may eventually become a library call. Treat the cost
715 // as such.
716 switch (I.getOpcode()) {
717 case Instruction::FPTrunc:
718 case Instruction::FPExt:
719 case Instruction::UIToFP:
720 case Instruction::SIToFP:
721 case Instruction::FPToUI:
722 case Instruction::FPToSI:
723 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
724 Cost += InlineConstants::CallPenalty;
725 break;
726 default:
727 break;
728 }
729
730 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
731 }
732
733 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
734 Value *Operand = I.getOperand(0);
735 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
736 return ConstantFoldInstOperands(&I, COps[0], DL);
737 }))
738 return true;
739
740 // Disable any SROA on the argument to arbitrary unary operators.
741 disableSROA(Operand);
742
743 return false;
744 }
745
746 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
747 return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
748 }
749
750 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
751 // Does the *call site* have the NonNull attribute set on an argument? We
752 // use the attribute on the call site to memoize any analysis done in the
753 // caller. This will also trip if the callee function has a non-null
754 // parameter attribute, but that's a less interesting case because hopefully
755 // the callee would already have been simplified based on that.
756 if (Argument *A = dyn_cast<Argument>(V))
757 if (paramHasAttr(A, Attribute::NonNull))
758 return true;
759
760 // Is this an alloca in the caller? This is distinct from the attribute case
761 // above because attributes aren't updated within the inliner itself and we
762 // always want to catch the alloca derived case.
763 if (isAllocaDerivedArg(V))
764 // We can actually predict the result of comparisons between an
765 // alloca-derived value and null. Note that this fires regardless of
766 // SROA firing.
767 return true;
768
769 return false;
770 }
771
772 bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
773 // If the normal destination of the invoke or the parent block of the call
774 // site is unreachable-terminated, there is little point in inlining this
775 // unless there is literally zero cost.
776 // FIXME: Note that it is possible that an unreachable-terminated block has a
777 // hot entry. For example, in the scenario below, inlining hot_call_X() may be
778 // beneficial :
779 // main() {
780 // hot_call_1();
781 // ...
782 // hot_call_N()
783 // exit(0);
784 // }
785 // For now, we are not handling this corner case here as it is rare in real
786 // code. In the future, we should elaborate this based on BPI and BFI in more
787 // general threshold adjusting heuristics in updateThreshold().
788 Instruction *Instr = CS.getInstruction();
789 if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
790 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
791 return false;
792 } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
793 return false;
794
795 return true;
796 }
797
798 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
799 // If global profile summary is available, then callsite's coldness is
800 // determined based on that.
801 if (PSI && PSI->hasProfileSummary())
802 return PSI->isColdCallSite(CS, CallerBFI);
803
804 // Otherwise we need BFI to be available.
805 if (!CallerBFI)
806 return false;
807
808 // Determine if the callsite is cold relative to caller's entry. We could
809 // potentially cache the computation of scaled entry frequency, but the added
810 // complexity is not worth it unless this scaling shows up high in the
811 // profiles.
812 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
813 auto CallSiteBB = CS.getInstruction()->getParent();
814 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
815 auto CallerEntryFreq =
816 CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
817 return CallSiteFreq < CallerEntryFreq * ColdProb;
818 }
819
820 Optional<int>
821 CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
822 BlockFrequencyInfo *CallerBFI) {
823
824 // If global profile summary is available, then callsite's hotness is
825 // determined based on that.
826 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
827 return Params.HotCallSiteThreshold;
828
829 // Otherwise we need BFI to be available and to have a locally hot callsite
830 // threshold.
831 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
832 return None;
833
834 // Determine if the callsite is hot relative to caller's entry. We could
835 // potentially cache the computation of scaled entry frequency, but the added
836 // complexity is not worth it unless this scaling shows up high in the
837 // profiles.
838 auto CallSiteBB = CS.getInstruction()->getParent();
839 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
840 auto CallerEntryFreq = CallerBFI->getEntryFreq();
841 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
842 return Params.LocallyHotCallSiteThreshold;
843
844 // Otherwise treat it normally.
845 return None;
846 }
847
848 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
849 // If no size growth is allowed for this inlining, set Threshold to 0.
850 if (!allowSizeGrowth(CS)) {
851 Threshold = 0;
852 return;
853 }
854
855 Function *Caller = CS.getCaller();
856
857 // return min(A, B) if B is valid.
858 auto MinIfValid = [](int A, Optional<int> B) {
859 return B ? std::min(A, B.getValue()) : A;
860 };
861
862 // return max(A, B) if B is valid.
863 auto MaxIfValid = [](int A, Optional<int> B) {
864 return B ? std::max(A, B.getValue()) : A;
865 };
866
867 // Various bonus percentages. These are multiplied by Threshold to get the
868 // bonus values.
869 // SingleBBBonus: This bonus is applied if the callee has a single reachable
870 // basic block at the given callsite context. This is speculatively applied
871 // and withdrawn if more than one basic block is seen.
872 //
873 // Vector bonuses: We want to more aggressively inline vector-dense kernels
874 // and apply this bonus based on the percentage of vector instructions. A
875 // bonus is applied if the vector instructions exceed 50% and half that amount
876 // is applied if it exceeds 10%. Note that these bonuses are somewhat
877 // arbitrary and evolved over time by accident as much as because they are
878 // principled bonuses.
879 // FIXME: It would be nice to base the bonus values on something more
880 // scientific.
881 //
882 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
883 // of the last call to a static function as inlining such functions is
884 // guaranteed to reduce code size.
885 //
886 // These bonus percentages may be set to 0 based on properties of the caller
887 // and the callsite.
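// For example, with an effective threshold of 225 (the default), the
// speculative single-BB bonus computed below is 225 * 50 / 100 = 112 and the
// full vector bonus is 225 * 150 / 100 = 337.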
888 int SingleBBBonusPercent = 50;
889 int VectorBonusPercent = 150;
890 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
891
892 // Lambda to set all the above bonus and bonus percentages to 0.
893 auto DisallowAllBonuses = [&]() {
894 SingleBBBonusPercent = 0;
895 VectorBonusPercent = 0;
896 LastCallToStaticBonus = 0;
897 };
898
899 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
900 // and reduce the threshold if the caller has the necessary attribute.
901 if (Caller->optForMinSize()) {
902 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
903 // For minsize, we want to disable the single BB bonus and the vector
904 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
905 // a static function will, at the minimum, eliminate the parameter setup and
906 // call/return instructions.
907 SingleBBBonusPercent = 0;
908 VectorBonusPercent = 0;
909 } else if (Caller->optForSize())
910 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
911
912 // Adjust the threshold based on inlinehint attribute and profile based
913 // hotness information if the caller does not have MinSize attribute.
914 if (!Caller->optForMinSize()) {
915 if (Callee.hasFnAttribute(Attribute::InlineHint))
916 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
917
918 // FIXME: After switching to the new passmanager, simplify the logic below
919 // by checking only the callsite hotness/coldness as we will reliably
920 // have local profile information.
921 //
922 // Callsite hotness and coldness can be determined if sample profile is
923 // used (which adds hotness metadata to calls) or if caller's
924 // BlockFrequencyInfo is available.
925 BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
926 auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
927 if (!Caller->optForSize() && HotCallSiteThreshold) {
928 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
929 // FIXME: This should update the threshold only if it exceeds the
930 // current threshold, but AutoFDO + ThinLTO currently relies on this
931 // behavior to prevent inlining of hot callsites during ThinLTO
932 // compile phase.
933 Threshold = HotCallSiteThreshold.getValue();
934 } else if (isColdCallSite(CS, CallerBFI)) {
935 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
936 // Do not apply bonuses for a cold callsite including the
937 // LastCallToStatic bonus. While this bonus might result in code size
938 // reduction, it can cause the size of a non-cold caller to increase
939 // preventing it from being inlined.
940 DisallowAllBonuses();
941 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
942 } else if (PSI) {
943 // Use callee's global profile information only if we have no way of
944 // determining this via callsite information.
945 if (PSI->isFunctionEntryHot(&Callee)) {
946 LLVM_DEBUG(dbgs() << "Hot callee.\n");
947 // If callsite hotness can not be determined, we may still know
948 // that the callee is hot and treat it as a weaker hint for threshold
949 // increase.
950 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
951 } else if (PSI->isFunctionEntryCold(&Callee)) {
952 LLVM_DEBUG(dbgs() << "Cold callee.\n");
953 // Do not apply bonuses for a cold callee including the
954 // LastCallToStatic bonus. While this bonus might result in code size
955 // reduction, it can cause the size of a non-cold caller to increase
956 // preventing it from being inlined.
957 DisallowAllBonuses();
958 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
959 }
960 }
961 }
962
963 // Finally, take the target-specific inlining threshold multiplier into
964 // account.
965 Threshold *= TTI.getInliningThresholdMultiplier();
966
967 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
968 VectorBonus = Threshold * VectorBonusPercent / 100;
969
970 bool OnlyOneCallAndLocalLinkage =
971 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
972 // If there is only one call of the function, and it has internal linkage,
973 // the cost of inlining it drops dramatically. It may seem odd to update
974 // Cost in updateThreshold, but the bonus depends on the logic in this method.
975 if (OnlyOneCallAndLocalLinkage)
976 Cost -= LastCallToStaticBonus;
977 }
978
979 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
980 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
981 // First try to handle simplified comparisons.
982 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
983 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
984 }))
985 return true;
986
987 if (I.getOpcode() == Instruction::FCmp)
988 return false;
989
990 // Otherwise look for a comparison between constant offset pointers with
991 // a common base.
992 Value *LHSBase, *RHSBase;
993 APInt LHSOffset, RHSOffset;
994 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
995 if (LHSBase) {
996 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
997 if (RHSBase && LHSBase == RHSBase) {
998 // We have common bases, fold the icmp to a constant based on the
999 // offsets.
1000 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1001 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1002 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1003 SimplifiedValues[&I] = C;
1004 ++NumConstantPtrCmps;
1005 return true;
1006 }
1007 }
1008 }
1009
1010 // If the comparison is an equality comparison with null, we can simplify it
1011 // if we know the value (argument) can't be null
1012 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1013 isKnownNonNullInCallee(I.getOperand(0))) {
1014 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1015 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1016 : ConstantInt::getFalse(I.getType());
1017 return true;
1018 }
1019 // Finally check for SROA candidates in comparisons.
1020 Value *SROAArg;
1021 DenseMap<Value *, int>::iterator CostIt;
1022 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
1023 if (isa<ConstantPointerNull>(I.getOperand(1))) {
1024 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1025 return true;
1026 }
1027
1028 disableSROA(CostIt);
1029 }
1030
1031 return false;
1032 }
1033
1034 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1035 // Try to handle a special case: we can fold computing the difference of two
1036 // constant-related pointers.
1037 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1038 Value *LHSBase, *RHSBase;
1039 APInt LHSOffset, RHSOffset;
1040 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1041 if (LHSBase) {
1042 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1043 if (RHSBase && LHSBase == RHSBase) {
1044 // We have common bases, fold the subtract to a constant based on the
1045 // offsets.
1046 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1047 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1048 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1049 SimplifiedValues[&I] = C;
1050 ++NumConstantPtrDiffs;
1051 return true;
1052 }
1053 }
1054 }
1055
1056 // Otherwise, fall back to the generic logic for simplifying and handling
1057 // instructions.
1058 return Base::visitSub(I);
1059 }
1060
1061 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1062 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1063 Constant *CLHS = dyn_cast<Constant>(LHS);
1064 if (!CLHS)
1065 CLHS = SimplifiedValues.lookup(LHS);
1066 Constant *CRHS = dyn_cast<Constant>(RHS);
1067 if (!CRHS)
1068 CRHS = SimplifiedValues.lookup(RHS);
1069
1070 Value *SimpleV = nullptr;
1071 if (auto FI = dyn_cast<FPMathOperator>(&I))
1072 SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
1073 CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
1074 else
1075 SimpleV =
1076 SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1077
1078 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1079 SimplifiedValues[&I] = C;
1080
1081 if (SimpleV)
1082 return true;
1083
1084 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1085 disableSROA(LHS);
1086 disableSROA(RHS);
1087
1088 // If the instruction is floating point, and the target says this operation
1089 // is expensive, this may eventually become a library call. Treat the cost
1090 // as such.
1091 if (I.getType()->isFloatingPointTy() &&
1092 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1093 Cost += InlineConstants::CallPenalty;
1094
1095 return false;
1096 }
1097
1098 bool CallAnalyzer::visitLoad(LoadInst &I) {
1099 Value *SROAArg;
1100 DenseMap<Value *, int>::iterator CostIt;
1101 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1102 if (I.isSimple()) {
1103 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1104 return true;
1105 }
1106
1107 disableSROA(CostIt);
1108 }
1109
1110 // If the data is already loaded from this address and hasn't been clobbered
1111 // by any stores or calls, this load is likely to be redundant and can be
1112 // eliminated.
1113 if (EnableLoadElimination &&
1114 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
1115 LoadEliminationCost += InlineConstants::InstrCost;
1116 return true;
1117 }
1118
1119 return false;
1120 }
1121
1122 bool CallAnalyzer::visitStore(StoreInst &I) {
1123 Value *SROAArg;
1124 DenseMap<Value *, int>::iterator CostIt;
1125 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
1126 if (I.isSimple()) {
1127 accumulateSROACost(CostIt, InlineConstants::InstrCost);
1128 return true;
1129 }
1130
1131 disableSROA(CostIt);
1132 }
1133
1134 // The store can potentially clobber loads and prevent repeated loads from
1135 // being eliminated.
1136 // FIXME:
1137 // 1. We can probably keep an initial set of eliminatable loads subtracted
1138 // from the cost even when we finally see a store. We just need to disable
1139 // *further* accumulation of elimination savings.
1140 // 2. We should probably at some point thread MemorySSA for the callee into
1141 // this and then use that to actually compute *really* precise savings.
1142 disableLoadElimination();
1143 return false;
1144 }
1145
1146 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
1147 // Constant folding for extract value is trivial.
1148 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1149 return ConstantExpr::getExtractValue(COps[0], I.getIndices());
1150 }))
1151 return true;
1152
1153 // SROA can look through these but give them a cost.
1154 return false;
1155 }
1156
1157 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
1158 // Constant folding for insert value is trivial.
1159 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
1160 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
1161 /*InsertedValueOperand*/ COps[1],
1162 I.getIndices());
1163 }))
1164 return true;
1165
1166 // SROA can look through these but give them a cost.
1167 return false;
1168 }
1169
1170 /// Try to simplify a call site.
1171 ///
1172 /// Takes a concrete function and callsite and tries to actually simplify it by
1173 /// analyzing the arguments and call itself with instsimplify. Returns true if
1174 /// it has simplified the callsite to some other entity (a constant), making it
1175 /// free.
1176 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
1177 // FIXME: Using the instsimplify logic directly for this is inefficient
1178 // because we have to continually rebuild the argument list even when no
1179 // simplifications can be performed. Until that is fixed with remapping
1180 // inside of instsimplify, directly constant fold calls here.
1181 if (!canConstantFoldCallTo(CS, F))
1182 return false;
1183
1184 // Try to re-map the arguments to constants.
1185 SmallVector<Constant *, 4> ConstantArgs;
1186 ConstantArgs.reserve(CS.arg_size());
1187 for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
1188 ++I) {
1189 Constant *C = dyn_cast<Constant>(*I);
1190 if (!C)
1191 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
1192 if (!C)
1193 return false; // This argument doesn't map to a constant.
1194
1195 ConstantArgs.push_back(C);
1196 }
1197 if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
1198 SimplifiedValues[CS.getInstruction()] = C;
1199 return true;
1200 }
1201
1202 return false;
1203 }
1204
1205 bool CallAnalyzer::visitCallSite(CallSite CS) {
1206 if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
1207 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
1208 // This aborts the entire analysis.
1209 ExposesReturnsTwice = true;
1210 return false;
1211 }
1212 if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
1213 ContainsNoDuplicateCall = true;
1214
1215 if (Function *F = CS.getCalledFunction()) {
1216 // When we have a concrete function, first try to simplify it directly.
1217 if (simplifyCallSite(F, CS))
1218 return true;
1219
1220 // Next check if it is an intrinsic we know about.
1221 // FIXME: Lift this into part of the InstVisitor.
1222 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
1223 switch (II->getIntrinsicID()) {
1224 default:
1225 if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
1226 disableLoadElimination();
1227 return Base::visitCallSite(CS);
1228
1229 case Intrinsic::load_relative:
1230 // This is normally lowered to 4 LLVM instructions.
1231 Cost += 3 * InlineConstants::InstrCost;
1232 return false;
1233
1234 case Intrinsic::memset:
1235 case Intrinsic::memcpy:
1236 case Intrinsic::memmove:
1237 disableLoadElimination();
1238 // SROA can usually chew through these intrinsics, but they aren't free.
1239 return false;
1240 case Intrinsic::icall_branch_funnel:
1241 case Intrinsic::localescape:
1242 HasUninlineableIntrinsic = true;
1243 return false;
1244 case Intrinsic::vastart:
1245 InitsVargArgs = true;
1246 return false;
1247 }
1248 }
1249
1250 if (F == CS.getInstruction()->getFunction()) {
1251 // This flag will fully abort the analysis, so don't bother with anything
1252 // else.
1253 IsRecursiveCall = true;
1254 return false;
1255 }
1256
1257 if (TTI.isLoweredToCall(F)) {
1258 // We account for the average 1 instruction per call argument setup
1259 // here.
1260 Cost += CS.arg_size() * InlineConstants::InstrCost;
1261
1262 // Everything other than inline ASM will also have a significant cost
1263 // merely from making the call.
1264 if (!isa<InlineAsm>(CS.getCalledValue()))
1265 Cost += InlineConstants::CallPenalty;
1266 }
1267
1268 if (!CS.onlyReadsMemory())
1269 disableLoadElimination();
1270 return Base::visitCallSite(CS);
1271 }
1272
1273 // Otherwise we're in a very special case -- an indirect function call. See
1274 // if we can be particularly clever about this.
1275 Value *Callee = CS.getCalledValue();
1276
1277 // First, pay the price of the argument setup. We account for the average
1278 // 1 instruction per call argument setup here.
1279 Cost += CS.arg_size() * InlineConstants::InstrCost;
1280
1281 // Next, check if this happens to be an indirect function call to a known
1282 // function in this inline context. If not, we've done all we can.
1283 Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
1284 if (!F) {
1285 if (!CS.onlyReadsMemory())
1286 disableLoadElimination();
1287 return Base::visitCallSite(CS);
1288 }
1289
1290 // If we have a constant that we are calling as a function, we can peer
1291 // through it and see the function target. This happens not infrequently
1292 // during devirtualization and so we want to give it a hefty bonus for
1293 // inlining, but cap that bonus in the event that inlining wouldn't pan
1294 // out. Pretend to inline the function, with a custom threshold.
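// For example, if the nested analysis succeeds with a cost of 100 against the
// indirect-call threshold, this call site receives a bonus of
// (threshold - 100), clamped below at zero, subtracted from its Cost.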
1295 auto IndirectCallParams = Params;
1296 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
1297 CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
1298 IndirectCallParams);
1299 if (CA.analyzeCall(CS)) {
1300 // We were able to inline the indirect call! Subtract the cost from the
1301 // threshold to get the bonus we want to apply, but don't go below zero.
1302 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
1303 }
1304
1305 if (!F->onlyReadsMemory())
1306 disableLoadElimination();
1307 return Base::visitCallSite(CS);
1308 }
1309
1310 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
1311 // At least one return instruction will be free after inlining.
1312 bool Free = !HasReturn;
1313 HasReturn = true;
1314 return Free;
1315 }
1316
1317 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
1318 // We model unconditional branches as essentially free -- they really
1319 // shouldn't exist at all, but handling them makes the behavior of the
1320 // inliner more regular and predictable. Interestingly, conditional branches
1321 // which will fold away are also free.
1322 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
1323 dyn_cast_or_null<ConstantInt>(
1324 SimplifiedValues.lookup(BI.getCondition()));
1325 }
1326
1327 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
1328 bool CheckSROA = SI.getType()->isPointerTy();
1329 Value *TrueVal = SI.getTrueValue();
1330 Value *FalseVal = SI.getFalseValue();
1331
1332 Constant *TrueC = dyn_cast<Constant>(TrueVal);
1333 if (!TrueC)
1334 TrueC = SimplifiedValues.lookup(TrueVal);
1335 Constant *FalseC = dyn_cast<Constant>(FalseVal);
1336 if (!FalseC)
1337 FalseC = SimplifiedValues.lookup(FalseVal);
1338 Constant *CondC =
1339 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
1340
1341 if (!CondC) {
1342 // Select C, X, X => X
1343 if (TrueC == FalseC && TrueC) {
1344 SimplifiedValues[&SI] = TrueC;
1345 return true;
1346 }
1347
1348 if (!CheckSROA)
1349 return Base::visitSelectInst(SI);
1350
1351 std::pair<Value *, APInt> TrueBaseAndOffset =
1352 ConstantOffsetPtrs.lookup(TrueVal);
1353 std::pair<Value *, APInt> FalseBaseAndOffset =
1354 ConstantOffsetPtrs.lookup(FalseVal);
1355 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
1356 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
1357
1358 Value *SROAArg;
1359 DenseMap<Value *, int>::iterator CostIt;
1360 if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
1361 SROAArgValues[&SI] = SROAArg;
1362 return true;
1363 }
1364
1365 return Base::visitSelectInst(SI);
1366 }
1367
1368 // Select condition is a constant.
1369 Value *SelectedV = CondC->isAllOnesValue()
1370 ? TrueVal
1371 : (CondC->isNullValue()) ? FalseVal : nullptr;
1372 if (!SelectedV) {
1373 // Condition is a vector constant that is not all 1s or all 0s. If all
1374 // operands are constants, ConstantExpr::getSelect() can handle the cases
1375 // such as select vectors.
1376 if (TrueC && FalseC) {
1377 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
1378 SimplifiedValues[&SI] = C;
1379 return true;
1380 }
1381 }
1382 return Base::visitSelectInst(SI);
1383 }
1384
1385 // Condition is either all 1s or all 0s. SI can be simplified.
1386 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
1387 SimplifiedValues[&SI] = SelectedC;
1388 return true;
1389 }
1390
1391 if (!CheckSROA)
1392 return true;
1393
1394 std::pair<Value *, APInt> BaseAndOffset =
1395 ConstantOffsetPtrs.lookup(SelectedV);
1396 if (BaseAndOffset.first) {
1397 ConstantOffsetPtrs[&SI] = BaseAndOffset;
1398
1399 Value *SROAArg;
1400 DenseMap<Value *, int>::iterator CostIt;
1401 if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
1402 SROAArgValues[&SI] = SROAArg;
1403 }
1404
1405 return true;
1406 }
1407
1408 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
1409 // We model unconditional switches as free, see the comments on handling
1410 // branches.
1411 if (isa<ConstantInt>(SI.getCondition()))
1412 return true;
1413 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
1414 if (isa<ConstantInt>(V))
1415 return true;
1416
1417 // Assume the most general case where the switch is lowered into
1418 // either a jump table, bit test, or a balanced binary tree consisting of
1419 // case clusters without merging adjacent clusters with the same
1420 // destination. We do not consider the switches that are lowered with a mix
1421 // of jump table/bit test/binary search tree. The cost of the switch is
1422 // proportional to the size of the tree or the size of jump table range.
1423 //
1424 // NB: In simplify-cfg we convert large switches that are just used to
1425 // initialize large phi nodes into lookup tables instead, so this shouldn't
1426 // prevent inlining them. It will prevent inlining in cases where the
1427 // optimization does not (yet) fire.
1428
1429 // Maximum valid cost to which Cost may be raised in this function.
1430 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1431
1432 // Exit early for a large switch, assuming one case needs at least one
1433 // instruction.
1434 // FIXME: This is not true for a bit test, but ignore that case for now to
1435 // save compile time.
1436 int64_t CostLowerBound =
1437 std::min((int64_t)CostUpperBound,
1438 (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1439
1440 if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1441 Cost = CostLowerBound;
1442 return false;
1443 }
1444
1445 unsigned JumpTableSize = 0;
1446 unsigned NumCaseCluster =
1447 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1448
1449 // If suitable for a jump table, consider the cost for the table size and
1450 // branch to destination.
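// For example, a 16-entry jump table is charged 16 * InstrCost for the table
// itself plus a fixed 4 * InstrCost of overhead.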
1451 if (JumpTableSize) {
1452 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1453 4 * InlineConstants::InstrCost;
1454
1455 Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1456 return false;
1457 }
1458
1459 // Considering forming a binary search, we should find the number of nodes
1460 // which is the same as the number of comparisons when lowered. For a given
1461 // number of clusters, n, we can define a recursive function, f(n), to find
1462 // the number of nodes in the tree. The recursion is:
1463 // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
1464 // and f(n) = n, when n <= 3.
1465 // This will lead to a binary tree where each leaf should be either f(2) or f(3)
1466 // when n > 3. So, the number of comparisons from leaves should be n, while
1467 // the number of non-leaf nodes should be:
1468 // 2^(log2(n) - 1) - 1
1469 // = 2^log2(n) * 2^-1 - 1
1470 // = n / 2 - 1.
1471 // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1472 // number of comparisons in a simple closed form :
1473 // n + n / 2 - 1 = n * 3 / 2 - 1
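// For example, 8 case clusters give an estimated 8 * 3 / 2 - 1 = 11 comparisons.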
1474 if (NumCaseCluster <= 3) {
1475 // Suppose a comparison includes one compare and one conditional branch.
1476 Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1477 return false;
1478 }
1479
1480 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1481 int64_t SwitchCost =
1482 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1483
1484 Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1485 return false;
1486 }
1487
1488 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1489 // We never want to inline functions that contain an indirectbr. This is
1490 // incorrect because all the blockaddress's (in static global initializers
1491 // for example) would be referring to the original function, and this
1492 // indirect jump would jump from the inlined copy of the function into the
1493 // original function which is extremely undefined behavior.
1494 // FIXME: This logic isn't really right; we can safely inline functions with
1495 // indirectbr's as long as no other function or global references the
1496 // blockaddress of a block within the current function.
1497 HasIndirectBr = true;
1498 return false;
1499 }
1500
1501 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1502 // FIXME: It's not clear that a single instruction is an accurate model for
1503 // the inline cost of a resume instruction.
1504 return false;
1505 }
1506
1507 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1508 // FIXME: It's not clear that a single instruction is an accurate model for
1509 // the inline cost of a cleanupret instruction.
1510 return false;
1511 }
1512
1513 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1514 // FIXME: It's not clear that a single instruction is an accurate model for
1515 // the inline cost of a catchret instruction.
1516 return false;
1517 }
1518
1519 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1520 // FIXME: It might be reasonable to discount the cost of instructions leading
1521 // to unreachable as they have the lowest possible impact on both runtime and
1522 // code size.
1523 return true; // No actual code is needed for unreachable.
1524 }
1525
1526 bool CallAnalyzer::visitInstruction(Instruction &I) {
1527 // Some instructions are free. All of the free intrinsics can also be
1528 // handled by SROA, etc.
1529 if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1530 return true;
1531
1532 // We found something we don't understand or can't handle. Mark any SROA-able
1533 // values in the operand list as no longer viable.
1534 for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1535 disableSROA(*OI);
1536
1537 return false;
1538 }
1539
1540 /// Analyze a basic block for its contribution to the inline cost.
1541 ///
1542 /// This method walks the analyzer over every instruction in the given basic
1543 /// block and accounts for their cost during inlining at this callsite. It
1544 /// aborts early if the threshold has been exceeded or an impossible-to-inline
1545 /// construct has been detected. It returns false if inlining is no longer
1546 /// viable, and true if inlining remains viable.
1547 InlineResult
1548 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1549 SmallPtrSetImpl<const Value *> &EphValues) {
1550 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1551 // FIXME: Currently, the number of instructions in a function, regardless of
1552 // our ability to simplify them during inlining to constants or dead code, is
1553 // actually used by the vector bonus heuristic. As long as that's true,
1554 // we have to special case debug intrinsics here to prevent differences in
1555 // inlining due to debug symbols. Eventually, the number of unsimplified
1556 // instructions shouldn't factor into the cost computation, but until then,
1557 // hack around it here.
1558 if (isa<DbgInfoIntrinsic>(I))
1559 continue;
1560
1561 // Skip ephemeral values.
1562 if (EphValues.count(&*I))
1563 continue;
1564
1565 ++NumInstructions;
1566 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1567 ++NumVectorInstructions;
1568
1569 // If the instruction simplified to a constant, there is no cost to this
1570 // instruction. Visit the instructions using our InstVisitor to account for
1571 // all of the per-instruction logic. The visit tree returns true if we
1572 // consumed the instruction in any way, and false if the instruction's base
1573 // cost should count against inlining.
1574 if (Base::visit(&*I))
1575 ++NumInstructionsSimplified;
1576 else
1577 Cost += InlineConstants::InstrCost;
1578
1579 using namespace ore;
1580 // If visiting this instruction detected an uninlinable pattern, abort.
1581 InlineResult IR;
1582 if (IsRecursiveCall)
1583 IR = "recursive";
1584 else if (ExposesReturnsTwice)
1585 IR = "exposes returns twice";
1586 else if (HasDynamicAlloca)
1587 IR = "dynamic alloca";
1588 else if (HasIndirectBr)
1589 IR = "indirect branch";
1590 else if (HasUninlineableIntrinsic)
1591 IR = "uninlinable intrinsic";
1592 else if (InitsVargArgs)
1593 IR = "varargs";
1594 if (!IR) {
1595 if (ORE)
1596 ORE->emit([&]() {
1597 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1598 CandidateCS.getInstruction())
1599 << NV("Callee", &F) << " has uninlinable pattern ("
1600 << NV("InlineResult", IR.message)
1601 << ") and cost is not fully computed";
1602 });
1603 return IR;
1604 }
1605
1606 // If the caller is a recursive function then we don't want to inline
1607 // functions which allocate a lot of stack space because it would increase
1608 // the caller stack usage dramatically.
1609 if (IsCallerRecursive &&
1610 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1611 InlineResult IR = "recursive and allocates too much stack space";
1612 if (ORE)
1613 ORE->emit([&]() {
1614 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1615 CandidateCS.getInstruction())
1616 << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1617 << ". Cost is not fully computed";
1618 });
1619 return IR;
1620 }
1621
1622 // Check if we've passed the maximum possible threshold so we don't spin in
1623 // huge basic blocks that will never inline.
1624 if (Cost >= Threshold && !ComputeFullInlineCost)
1625 return false;
1626 }
1627
1628 return true;
1629 }
1630
1631 /// Compute the base pointer and cumulative constant offsets for V.
1632 ///
1633 /// This strips all constant offsets off of V, leaving it the base pointer, and
1634 /// accumulates the total constant offset applied in the returned constant. It
1635 /// returns null if V is not a pointer, and returns the constant '0' if there are
1636 /// no constant offsets applied.
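/// For example, given V = 'getelementptr inbounds i32, i32* %a, i64 3' with
/// 4-byte i32, V is rewritten to %a and the returned constant is 12.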
1637 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1638 if (!V->getType()->isPointerTy())
1639 return nullptr;
1640
1641 unsigned AS = V->getType()->getPointerAddressSpace();
1642 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1643 APInt Offset = APInt::getNullValue(IntPtrWidth);
1644
1645 // Even though we don't look through PHI nodes, we could be called on an
1646 // instruction in an unreachable block, which may be on a cycle.
1647 SmallPtrSet<Value *, 4> Visited;
1648 Visited.insert(V);
1649 do {
1650 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1651 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1652 return nullptr;
1653 V = GEP->getPointerOperand();
1654 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1655 V = cast<Operator>(V)->getOperand(0);
1656 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1657 if (GA->isInterposable())
1658 break;
1659 V = GA->getAliasee();
1660 } else {
1661 break;
1662 }
1663 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1664 } while (Visited.insert(V).second);
1665
1666 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1667 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1668 }
1669
1670 /// Find dead blocks due to deleted CFG edges during inlining.
1671 ///
1672 /// If we know the successor of the current block, \p CurrBB, has to be \p
1673 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1674 /// no live incoming CFG edges. If one block is found to be dead, we can
1675 /// continue growing the dead block list by checking the successors of the dead
1676 /// blocks to see if all their incoming edges are dead or not.
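/// For example, if CurrBB ends in a conditional branch known to fold to NextBB,
/// the other successor becomes dead once it has no remaining live predecessors,
/// and any blocks reachable only through it are then marked dead as well.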
1677 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1678 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1679 // A CFG edge is dead if the predecessor is dead or the predecessor has a
1680 // known successor which is not the one under examination.
1681 return (DeadBlocks.count(Pred) ||
1682 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1683 };
1684
1685 auto IsNewlyDead = [&](BasicBlock *BB) {
1686 // If all the edges to a block are dead, the block is also dead.
1687 return (!DeadBlocks.count(BB) &&
1688 llvm::all_of(predecessors(BB),
1689 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1690 };
1691
1692 for (BasicBlock *Succ : successors(CurrBB)) {
1693 if (Succ == NextBB || !IsNewlyDead(Succ))
1694 continue;
1695 SmallVector<BasicBlock *, 4> NewDead;
1696 NewDead.push_back(Succ);
1697 while (!NewDead.empty()) {
1698 BasicBlock *Dead = NewDead.pop_back_val();
1699 if (DeadBlocks.insert(Dead))
1700 // Continue growing the dead block lists.
1701 for (BasicBlock *S : successors(Dead))
1702 if (IsNewlyDead(S))
1703 NewDead.push_back(S);
1704 }
1705 }
1706 }
1707
1708 /// Analyze a call site for potential inlining.
1709 ///
1710 /// Returns true if inlining this call is viable, and false if it is not
1711 /// viable. It computes the cost and adjusts the threshold based on numerous
1712 /// factors and heuristics. If this method returns false but the computed cost
1713 /// is below the computed threshold, then inlining was forcibly disabled by
1714 /// some artifact of the routine.
1715 InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
1716 ++NumCallsAnalyzed;
1717
1718 // Perform some tweaks to the cost and threshold based on the direct
1719 // callsite information.
1720
1721 // We want to more aggressively inline vector-dense kernels, so up the
1722 // threshold, and we'll lower it if the % of vector instructions gets too
1723 // low. Note that these bonuses are somewhat arbitrary and evolved over time
1724 // by accident as much as because they are principled bonuses.
1725 //
1726 // FIXME: It would be nice to remove all such bonuses. At least it would be
1727 // nice to base the bonus values on something more scientific.
1728 assert(NumInstructions == 0);
1729 assert(NumVectorInstructions == 0);
1730
1731 // Update the threshold based on callsite properties
1732 updateThreshold(CS, F);
1733
1734 // While Threshold depends on commandline options that can take negative
1735 // values, we want to enforce the invariant that the computed threshold and
1736 // bonuses are non-negative.
1737 assert(Threshold >= 0);
1738 assert(SingleBBBonus >= 0);
1739 assert(VectorBonus >= 0);
1740
1741 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1742 // this Threshold any time, and cost cannot decrease, we can stop processing
1743 // the rest of the function body.
1744 Threshold += (SingleBBBonus + VectorBonus);
1745
1746 // Give out bonuses for the callsite, as the instructions setting them up
1747 // will be gone after inlining.
1748 Cost -= getCallsiteCost(CS, DL);
1749
1750 // If this function uses the coldcc calling convention, prefer not to inline
1751 // it.
1752 if (F.getCallingConv() == CallingConv::Cold)
1753 Cost += InlineConstants::ColdccPenalty;
1754
1755 // Check if we're done. This can happen due to bonuses and penalties.
1756 if (Cost >= Threshold && !ComputeFullInlineCost)
1757 return "high cost";
1758
1759 if (F.empty())
1760 return true;
1761
1762 Function *Caller = CS.getInstruction()->getFunction();
1763 // Check if the caller function is recursive itself.
1764 for (User *U : Caller->users()) {
1765 CallSite Site(U);
1766 if (!Site)
1767 continue;
1768 Instruction *I = Site.getInstruction();
1769 if (I->getFunction() == Caller) {
1770 IsCallerRecursive = true;
1771 break;
1772 }
1773 }
1774
1775 // Populate our simplified values by mapping from function arguments to call
1776 // arguments with known important simplifications.
1777 CallSite::arg_iterator CAI = CS.arg_begin();
1778 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1779 FAI != FAE; ++FAI, ++CAI) {
1780 assert(CAI != CS.arg_end());
1781 if (Constant *C = dyn_cast<Constant>(CAI))
1782 SimplifiedValues[&*FAI] = C;
1783
1784 Value *PtrArg = *CAI;
1785 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1786 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1787
1788 // We can SROA any pointer arguments derived from alloca instructions.
1789 if (isa<AllocaInst>(PtrArg)) {
1790 SROAArgValues[&*FAI] = PtrArg;
1791 SROAArgCosts[PtrArg] = 0;
1792 }
1793 }
1794 }
1795 NumConstantArgs = SimplifiedValues.size();
1796 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1797 NumAllocaArgs = SROAArgValues.size();
1798
1799 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1800 // the ephemeral values multiple times (and they're completely determined by
1801 // the callee, so this is purely duplicate work).
1802 SmallPtrSet<const Value *, 32> EphValues;
1803 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1804
1805 // The worklist of live basic blocks in the callee *after* inlining. We avoid
1806 // adding basic blocks of the callee which can be proven to be dead for this
1807 // particular call site in order to get more accurate cost estimates. This
1808 // requires a somewhat heavyweight iteration pattern: we need to walk the
1809 // basic blocks in a breadth-first order as we insert live successors. To
1810 // accomplish this, and because we exit early once we cross our threshold, we
1811 // optimize for small iteration counts by using a small-size optimized SetVector.
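// (A SetVector both deduplicates inserted blocks and preserves insertion order,
// so each live block is visited at most once by the loop below.)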
1812 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1813 SmallPtrSet<BasicBlock *, 16>>
1814 BBSetVector;
1815 BBSetVector BBWorklist;
1816 BBWorklist.insert(&F.getEntryBlock());
1817 bool SingleBB = true;
1818 // Note that we *must not* cache the size, this loop grows the worklist.
1819 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1820 // Bail out the moment we cross the threshold. This means we'll under-count
1821 // the cost, but only when undercounting doesn't matter.
1822 if (Cost >= Threshold && !ComputeFullInlineCost)
1823 break;
1824
1825 BasicBlock *BB = BBWorklist[Idx];
1826 if (BB->empty())
1827 continue;
1828
1829 // Disallow inlining a blockaddress. A blockaddress only has defined
1830 // behavior for an indirect branch in the same function, and we do not
1831 // currently support inlining indirect branches. But, the inliner may not
1832 // see an indirect branch that ends up being dead code at a particular call
1833 // site. If the blockaddress escapes the function, e.g., via a global
1834 // variable, inlining may lead to an invalid cross-function reference.
1835 if (BB->hasAddressTaken())
1836 return "blockaddress";
1837
1838 // Analyze the cost of this block. If we blow through the threshold, this
1839 // returns false, and we can bail out.
1840 InlineResult IR = analyzeBlock(BB, EphValues);
1841 if (!IR)
1842 return IR;
1843
1844 Instruction *TI = BB->getTerminator();
1845
1846 // Add in the live successors by first checking whether we have a terminator
1847 // that may be simplified based on the values simplified by this call.
1848 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1849 if (BI->isConditional()) {
1850 Value *Cond = BI->getCondition();
1851 if (ConstantInt *SimpleCond =
1852 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1853 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1854 BBWorklist.insert(NextBB);
1855 KnownSuccessors[BB] = NextBB;
1856 findDeadBlocks(BB, NextBB);
1857 continue;
1858 }
1859 }
1860 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1861 Value *Cond = SI->getCondition();
1862 if (ConstantInt *SimpleCond =
1863 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1864 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1865 BBWorklist.insert(NextBB);
1866 KnownSuccessors[BB] = NextBB;
1867 findDeadBlocks(BB, NextBB);
1868 continue;
1869 }
1870 }
1871
1872 // If we're unable to select a particular successor, just count all of
1873 // them.
1874 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1875 ++TIdx)
1876 BBWorklist.insert(TI->getSuccessor(TIdx));
1877
1878 // If we had any successors at this point, then post-inlining is likely to
1879 // have them as well. Note that we assume any basic blocks which existed
1880 // due to branches or switches which folded above will also fold after
1881 // inlining.
1882 if (SingleBB && TI->getNumSuccessors() > 1) {
1883 // Take off the bonus we applied to the threshold.
1884 Threshold -= SingleBBBonus;
1885 SingleBB = false;
1886 }
1887 }
1888
1889 bool OnlyOneCallAndLocalLinkage =
1890 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1891 // If this is a noduplicate call, we can still inline as long as
1892 // inlining this would cause the removal of the caller (so the instruction
1893 // is not actually duplicated, just moved).
1894 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1895 return "noduplicate";
1896
1897 // Loops generally act a lot like calls in that they act like barriers to
1898 // movement, require a certain amount of setup, etc. So when optimising for
1899 // size, we penalise any call sites whose callees contain loops. We do this
1900 // after all other costs here, so we will likely only be dealing with
1901 // relatively small functions (and hence DT and LI will hopefully be cheap).
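// For example, a callee with two loops whose headers are still live adds
// 2 * InlineConstants::CallPenalty to the cost below.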
1902 if (Caller->optForMinSize()) {
1903 DominatorTree DT(F);
1904 LoopInfo LI(DT);
1905 int NumLoops = 0;
1906 for (Loop *L : LI) {
1907 // Ignore loops that will not be executed
1908 if (DeadBlocks.count(L->getHeader()))
1909 continue;
1910 NumLoops++;
1911 }
1912 Cost += NumLoops * InlineConstants::CallPenalty;
1913 }
1914
1915 // We applied the maximum possible vector bonus at the beginning. Now,
1916 // subtract the excess bonus, if any, from the Threshold before
1917 // comparing against Cost.
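// For example, if roughly 10% or fewer of the instructions are vector
// instructions, the whole VectorBonus is retracted; up to roughly 50%, half of
// it is; above that, the full bonus is kept.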
1918 if (NumVectorInstructions <= NumInstructions / 10)
1919 Threshold -= VectorBonus;
1920 else if (NumVectorInstructions <= NumInstructions / 2)
1921 Threshold -= VectorBonus/2;
1922
1923 return Cost < std::max(1, Threshold);
1924 }
1925
1926 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1927 /// Dump stats about this call's analysis.
1928 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1929 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1930 DEBUG_PRINT_STAT(NumConstantArgs);
1931 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1932 DEBUG_PRINT_STAT(NumAllocaArgs);
1933 DEBUG_PRINT_STAT(NumConstantPtrCmps);
1934 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1935 DEBUG_PRINT_STAT(NumInstructionsSimplified);
1936 DEBUG_PRINT_STAT(NumInstructions);
1937 DEBUG_PRINT_STAT(SROACostSavings);
1938 DEBUG_PRINT_STAT(SROACostSavingsLost);
1939 DEBUG_PRINT_STAT(LoadEliminationCost);
1940 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1941 DEBUG_PRINT_STAT(Cost);
1942 DEBUG_PRINT_STAT(Threshold);
1943 #undef DEBUG_PRINT_STAT
1944 }
1945 #endif
1946
1947 /// Test that there are no attribute conflicts between Caller and Callee
1948 /// that prevent inlining.
1949 static bool functionsHaveCompatibleAttributes(Function *Caller,
1950 Function *Callee,
1951 TargetTransformInfo &TTI) {
1952 return TTI.areInlineCompatible(Caller, Callee) &&
1953 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1954 }
1955
1956 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1957 int Cost = 0;
1958 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1959 if (CS.isByValArgument(I)) {
1960 // We approximate the number of loads and stores needed by dividing the
1961 // size of the byval type by the target's pointer size.
1962 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1963 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1964 unsigned AS = PTy->getAddressSpace();
1965 unsigned PointerSize = DL.getPointerSizeInBits(AS);
1966 // Ceiling division.
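// For example, a 24-byte (192-bit) byval type with 64-bit pointers needs
// (192 + 63) / 64 = 3 word-sized copies.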
1967 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1968
1969 // If it generates more than 8 stores it is likely to be expanded as an
1970 // inline memcpy so we take that as an upper bound. Otherwise we assume
1971 // one load and one store per word copied.
1972 // FIXME: The maxStoresPerMemcpy setting from the target should be used
1973 // here instead of a magic number of 8, but it's not available via
1974 // DataLayout.
1975 NumStores = std::min(NumStores, 8U);
1976
1977 Cost += 2 * NumStores * InlineConstants::InstrCost;
1978 } else {
1979 // For non-byval arguments subtract off one instruction per call
1980 // argument.
1981 Cost += InlineConstants::InstrCost;
1982 }
1983 }
1984 // The call instruction also disappears after inlining.
1985 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1986 return Cost;
1987 }
1988
1989 InlineCost llvm::getInlineCost(
1990 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1991 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1992 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1993 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1994 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1995 GetAssumptionCache, GetBFI, PSI, ORE);
1996 }
1997
1998 InlineCost llvm::getInlineCost(
1999 CallSite CS, Function *Callee, const InlineParams &Params,
2000 TargetTransformInfo &CalleeTTI,
2001 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2002 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2003 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2004
2005 // Cannot inline indirect calls.
2006 if (!Callee)
2007 return llvm::InlineCost::getNever("indirect call");
2008
2009 // Never inline calls with byval arguments that do not have the alloca
2010 // address space. Since byval arguments can be replaced with a copy to an
2011 // alloca, the inlined code would need to be adjusted to handle the argument
2012 // being in the alloca address space (so it is a little bit complicated to
2013 // solve).
2014 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2015 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
2016 if (CS.isByValArgument(I)) {
2017 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
2018 if (PTy->getAddressSpace() != AllocaAS)
2019 return llvm::InlineCost::getNever("byval arguments without alloca"
2020 " address space");
2021 }
2022
2023 // Calls to functions with always-inline attributes should be inlined
2024 // whenever possible.
2025 if (CS.hasFnAttr(Attribute::AlwaysInline)) {
2026 if (isInlineViable(*Callee))
2027 return llvm::InlineCost::getAlways("always inline attribute");
2028 return llvm::InlineCost::getNever("inapplicable always inline attribute");
2029 }
2030
2031 // Never inline functions with conflicting attributes (unless callee has
2032 // always-inline attribute).
2033 Function *Caller = CS.getCaller();
2034 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
2035 return llvm::InlineCost::getNever("conflicting attributes");
2036
2037 // Don't inline this call if the caller has the optnone attribute.
2038 if (Caller->hasFnAttribute(Attribute::OptimizeNone))
2039 return llvm::InlineCost::getNever("optnone attribute");
2040
2041 // Don't inline a function that treats null pointer as valid into a caller
2042 // that does not have this attribute.
2043 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2044 return llvm::InlineCost::getNever("nullptr definitions incompatible");
2045
2046 // Don't inline functions which can be interposed at link-time.
2047 if (Callee->isInterposable())
2048 return llvm::InlineCost::getNever("interposable");
2049
2050 // Don't inline functions marked noinline.
2051 if (Callee->hasFnAttribute(Attribute::NoInline))
2052 return llvm::InlineCost::getNever("noinline function attribute");
2053
2054 // Don't inline call sites marked noinline.
2055 if (CS.isNoInline())
2056 return llvm::InlineCost::getNever("noinline call site attribute");
2057
2058 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2059 << "... (caller:" << Caller->getName() << ")\n");
2060
2061 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
2062 Params);
2063 InlineResult ShouldInline = CA.analyzeCall(CS);
2064
2065 LLVM_DEBUG(CA.dump());
2066
2067 // Check if there was a reason to force inlining or no inlining.
2068 if (!ShouldInline && CA.getCost() < CA.getThreshold())
2069 return InlineCost::getNever(ShouldInline.message);
2070 if (ShouldInline && CA.getCost() >= CA.getThreshold())
2071 return InlineCost::getAlways("empty function");
2072
2073 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
2074 }
2075
2076 bool llvm::isInlineViable(Function &F) {
2077 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
2078 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
2079 // Disallow inlining of functions which contain indirect branches or
2080 // blockaddresses.
2081 if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2082 return false;
2083
2084 for (auto &II : *BI) {
2085 CallSite CS(&II);
2086 if (!CS)
2087 continue;
2088
2089 // Disallow recursive calls.
2090 if (&F == CS.getCalledFunction())
2091 return false;
2092
2093 // Disallow calls which expose returns-twice to a function not previously
2094 // attributed as such.
2095 if (!ReturnsTwice && CS.isCall() &&
2096 cast<CallInst>(CS.getInstruction())->canReturnTwice())
2097 return false;
2098
2099 if (CS.getCalledFunction())
2100 switch (CS.getCalledFunction()->getIntrinsicID()) {
2101 default:
2102 break;
2103 // Disallow inlining of @llvm.icall.branch.funnel because the current
2104 // backend can't separate call targets from call arguments.
2105 case llvm::Intrinsic::icall_branch_funnel:
2106 // Disallow inlining functions that call @llvm.localescape. Doing this
2107 // correctly would require major changes to the inliner.
2108 case llvm::Intrinsic::localescape:
2109 // Disallow inlining of functions that initialize VarArgs with va_start.
2110 case llvm::Intrinsic::vastart:
2111 return false;
2112 }
2113 }
2114 }
2115
2116 return true;
2117 }
2118
2119 // APIs to create InlineParams based on command line flags and/or other
2120 // parameters.
2121
2122 InlineParams llvm::getInlineParams(int Threshold) {
2123 InlineParams Params;
2124
2125 // This field is the threshold to use for a callee by default. This is
2126 // derived from one or more of:
2127 // * optimization or size-optimization levels,
2128 // * a value passed to createFunctionInliningPass function, or
2129 // * the -inline-threshold flag.
2130 // If the -inline-threshold flag is explicitly specified, that is used
2131 // irrespective of anything else.
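// For example, passing -inline-threshold=500 on the command line overrides any
// Threshold value supplied programmatically to this function.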
2132 if (InlineThreshold.getNumOccurrences() > 0)
2133 Params.DefaultThreshold = InlineThreshold;
2134 else
2135 Params.DefaultThreshold = Threshold;
2136
2137 // Set the HintThreshold knob from the -inlinehint-threshold.
2138 Params.HintThreshold = HintThreshold;
2139
2140 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2141 Params.HotCallSiteThreshold = HotCallSiteThreshold;
2142
2143 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2144 // populate LocallyHotCallSiteThreshold. Later, we populate
2145 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2146 // we know that optimization level is O3 (in the getInlineParams variant that
2147 // takes the opt and size levels).
2148 // FIXME: Remove this check (and make the assignment unconditional) after
2149 // addressing size regression issues at O2.
2150 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2151 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2152
2153 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2154 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2155
2156 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2157 // -inline-threshold command-line option is not explicitly given. If that
2158 // option is present, then its value applies even for callees with size and
2159 // minsize attributes.
2160 // If -inline-threshold is not specified, set the ColdThreshold from the
2161 // -inlinecold-threshold even if it is not explicitly passed. If
2162 // -inline-threshold is specified, then -inlinecold-threshold needs to be
2163 // explicitly specified to set the ColdThreshold knob.
2164 if (InlineThreshold.getNumOccurrences() == 0) {
2165 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2166 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2167 Params.ColdThreshold = ColdThreshold;
2168 } else if (ColdThreshold.getNumOccurrences() > 0) {
2169 Params.ColdThreshold = ColdThreshold;
2170 }
2171 return Params;
2172 }
2173
2174 InlineParams llvm::getInlineParams() {
2175 return getInlineParams(InlineThreshold);
2176 }
2177
2178 // Compute the default threshold for inlining based on the opt level and the
2179 // size opt level.
2180 static int computeThresholdFromOptLevels(unsigned OptLevel,
2181 unsigned SizeOptLevel) {
2182 if (OptLevel > 2)
2183 return InlineConstants::OptAggressiveThreshold;
2184 if (SizeOptLevel == 1) // -Os
2185 return InlineConstants::OptSizeThreshold;
2186 if (SizeOptLevel == 2) // -Oz
2187 return InlineConstants::OptMinSizeThreshold;
2188 return InlineThreshold;
2189 }
2190
2191 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2192 auto Params =
2193 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2194 // At O3, use the value of -locally-hot-callsite-threshold option to populate
2195 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
2196 // when it is specified explicitly.
2197 if (OptLevel > 2)
2198 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2199 return Params;
2200 }
2201