1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements inline cost analysis.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/Analysis/InlineCost.h"
14 #include "llvm/ADT/STLExtras.h"
15 #include "llvm/ADT/SetVector.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Statistic.h"
19 #include "llvm/Analysis/AssumptionCache.h"
20 #include "llvm/Analysis/BlockFrequencyInfo.h"
21 #include "llvm/Analysis/CodeMetrics.h"
22 #include "llvm/Analysis/ConstantFolding.h"
23 #include "llvm/Analysis/InstructionSimplify.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/Analysis/ValueTracking.h"
30 #include "llvm/Config/llvm-config.h"
31 #include "llvm/IR/AssemblyAnnotationWriter.h"
32 #include "llvm/IR/CallingConv.h"
33 #include "llvm/IR/DataLayout.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/GetElementPtrTypeIterator.h"
36 #include "llvm/IR/GlobalAlias.h"
37 #include "llvm/IR/InstVisitor.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Operator.h"
40 #include "llvm/IR/PatternMatch.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/Debug.h"
43 #include "llvm/Support/FormattedStream.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include <limits>
46
47 using namespace llvm;
48
49 #define DEBUG_TYPE "inline-cost"
50
51 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
52
53 static cl::opt<int>
54 DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
55 cl::desc("Default amount of inlining to perform"));
56
57 // We introduce this option since there is a minor compile-time win by avoiding
58 // addition of TTI attributes (target-features in particular) to inline
59 // candidates when they are guaranteed to be the same as top level methods in
60 // some use cases. If we avoid adding the attribute, we need an option to avoid
61 // checking these attributes.
62 static cl::opt<bool> IgnoreTTIInlineCompatible(
63 "ignore-tti-inline-compatible", cl::Hidden, cl::init(false),
64 cl::desc("Ignore TTI attributes compatibility check between callee/caller "
65 "during inline cost calculation"));
66
67 static cl::opt<bool> PrintInstructionComments(
68 "print-instruction-comments", cl::Hidden, cl::init(false),
69 cl::desc("Prints comments for instruction based on inline cost analysis"));
70
71 static cl::opt<int> InlineThreshold(
72 "inline-threshold", cl::Hidden, cl::init(225),
73 cl::desc("Control the amount of inlining to perform (default = 225)"));
74
75 static cl::opt<int> HintThreshold(
76 "inlinehint-threshold", cl::Hidden, cl::init(325),
77 cl::desc("Threshold for inlining functions with inline hint"));
78
79 static cl::opt<int>
80 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
81 cl::init(45),
82 cl::desc("Threshold for inlining cold callsites"));
83
84 static cl::opt<bool> InlineEnableCostBenefitAnalysis(
85 "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
86 cl::desc("Enable the cost-benefit analysis for the inliner"));
87
88 static cl::opt<int> InlineSavingsMultiplier(
89 "inline-savings-multiplier", cl::Hidden, cl::init(8),
90 cl::desc("Multiplier to multiply cycle savings by during inlining"));
91
92 static cl::opt<int>
93 InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
cl::desc("The maximum size of a callee that gets "
95 "inlined without sufficient cycle savings"));
96
97 // We introduce this threshold to help performance of instrumentation based
98 // PGO before we actually hook up inliner with analysis passes such as BPI and
99 // BFI.
100 static cl::opt<int> ColdThreshold(
101 "inlinecold-threshold", cl::Hidden, cl::init(45),
102 cl::desc("Threshold for inlining functions with cold attribute"));
103
104 static cl::opt<int>
105 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
106 cl::desc("Threshold for hot callsites "));
107
108 static cl::opt<int> LocallyHotCallSiteThreshold(
109 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525),
110 cl::desc("Threshold for locally hot callsites "));
111
112 static cl::opt<int> ColdCallSiteRelFreq(
113 "cold-callsite-rel-freq", cl::Hidden, cl::init(2),
114 cl::desc("Maximum block frequency, expressed as a percentage of caller's "
115 "entry frequency, for a callsite to be cold in the absence of "
116 "profile information."));
117
118 static cl::opt<int> HotCallSiteRelFreq(
119 "hot-callsite-rel-freq", cl::Hidden, cl::init(60),
120 cl::desc("Minimum block frequency, expressed as a multiple of caller's "
121 "entry frequency, for a callsite to be hot in the absence of "
122 "profile information."));
123
124 static cl::opt<int> CallPenalty(
125 "inline-call-penalty", cl::Hidden, cl::init(25),
126 cl::desc("Call penalty that is applied per callsite when inlining"));
127
128 static cl::opt<size_t>
129 StackSizeThreshold("inline-max-stacksize", cl::Hidden,
130 cl::init(std::numeric_limits<size_t>::max()),
131 cl::desc("Do not inline functions with a stack size "
132 "that exceeds the specified limit"));
133
134 static cl::opt<size_t>
135 RecurStackSizeThreshold("recursive-inline-max-stacksize", cl::Hidden,
136 cl::init(InlineConstants::TotalAllocaSizeRecursiveCaller),
137 cl::desc("Do not inline recursive functions with a stack "
138 "size that exceeds the specified limit"));
139
140 static cl::opt<bool> OptComputeFullInlineCost(
141 "inline-cost-full", cl::Hidden,
142 cl::desc("Compute the full inline cost of a call site even when the cost "
143 "exceeds the threshold."));
144
145 static cl::opt<bool> InlineCallerSupersetNoBuiltin(
146 "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
147 cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
148 "attributes."));
149
150 static cl::opt<bool> DisableGEPConstOperand(
151 "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
152 cl::desc("Disables evaluation of GetElementPtr with constant operands"));
153
154 namespace llvm {
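// Return the value of the named string attribute of \p CB (or, failing that,
// of its callee) parsed as an integer. For example, a call site carrying
//   call void @f() #0
//   attributes #0 = { "function-inline-cost"="42" }
// yields 42 for AttrKind "function-inline-cost". Returns None if the
// attribute is absent or its value is not a base-10 integer.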
Optional<int> getStringFnAttrAsInt(CallBase &CB, StringRef AttrKind) {
156 Attribute Attr = CB.getFnAttr(AttrKind);
157 int AttrValue;
158 if (Attr.getValueAsString().getAsInteger(10, AttrValue))
159 return None;
160 return AttrValue;
161 }
162 } // namespace llvm
163
164 namespace {
165 class InlineCostCallAnalyzer;
166
167 // This struct is used to store information about inline cost of a
168 // particular instruction
169 struct InstructionCostDetail {
170 int CostBefore = 0;
171 int CostAfter = 0;
172 int ThresholdBefore = 0;
173 int ThresholdAfter = 0;
174
int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

int getCostDelta() const { return CostAfter - CostBefore; }

bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
180 };
181
182 class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
183 private:
184 InlineCostCallAnalyzer *const ICCA;
185
186 public:
InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
188 void emitInstructionAnnot(const Instruction *I,
189 formatted_raw_ostream &OS) override;
190 };
191
192 /// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as an implementation detail of functions such
194 /// as llvm::getInlineCost. Note the function_ref constructor parameters - the
195 /// expectation is that they come from the outer scope, from the wrapper
196 /// functions. If we want to support constructing CallAnalyzer objects where
197 /// lambdas are provided inline at construction, or where the object needs to
198 /// otherwise survive past the scope of the provided functions, we need to
199 /// revisit the argument types.
200 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
201 typedef InstVisitor<CallAnalyzer, bool> Base;
202 friend class InstVisitor<CallAnalyzer, bool>;
203
204 protected:
205 virtual ~CallAnalyzer() = default;
206 /// The TargetTransformInfo available for this compilation.
207 const TargetTransformInfo &TTI;
208
209 /// Getter for the cache of @llvm.assume intrinsics.
210 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
211
212 /// Getter for BlockFrequencyInfo
213 function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
214
215 /// Profile summary information.
216 ProfileSummaryInfo *PSI;
217
218 /// The called function.
219 Function &F;
220
221 // Cache the DataLayout since we use it a lot.
222 const DataLayout &DL;
223
224 /// The OptimizationRemarkEmitter available for this compilation.
225 OptimizationRemarkEmitter *ORE;
226
227 /// The candidate callsite being analyzed. Please do not use this to do
228 /// analysis in the caller function; we want the inline cost query to be
229 /// easily cacheable. Instead, use the cover function paramHasAttr.
230 CallBase &CandidateCall;
231
232 /// Extension points for handling callsite features.
/// Called before a basic block is analyzed.
virtual void onBlockStart(const BasicBlock *BB) {}
235
236 /// Called after a basic block was analyzed.
virtual void onBlockAnalyzed(const BasicBlock *BB) {}
238
/// Called before an instruction is analyzed.
virtual void onInstructionAnalysisStart(const Instruction *I) {}
241
/// Called after an instruction has been analyzed.
virtual void onInstructionAnalysisFinish(const Instruction *I) {}
244
245 /// Called at the end of the analysis of the callsite. Return the outcome of
246 /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
247 /// the reason it can't.
virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
249 /// Called when we're about to start processing a basic block, and every time
250 /// we are done processing an instruction. Return true if there is no point in
251 /// continuing the analysis (e.g. we've determined already the call site is
252 /// too expensive to inline)
virtual bool shouldStop() { return false; }
254
255 /// Called before the analysis of the callee body starts (with callsite
256 /// contexts propagated). It checks callsite-specific information. Return a
257 /// reason analysis can't continue if that's the case, or 'true' if it may
258 /// continue.
virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
260 /// Called if the analysis engine decides SROA cannot be done for the given
261 /// alloca.
virtual void onDisableSROA(AllocaInst *Arg) {}
263
/// Called when the analysis engine determines load elimination won't happen.
virtual void onDisableLoadElimination() {}
266
267 /// Called when we visit a CallBase, before the analysis starts. Return false
268 /// to stop further processing of the instruction.
virtual bool onCallBaseVisitStart(CallBase &Call) { return true; }
270
271 /// Called to account for a call.
virtual void onCallPenalty() {}
273
274 /// Called to account for the expectation the inlining would result in a load
275 /// elimination.
virtual void onLoadEliminationOpportunity() {}
277
278 /// Called to account for the cost of argument setup for the Call in the
279 /// callee's body (not the callsite currently under analysis).
virtual void onCallArgumentSetup(const CallBase &Call) {}
281
282 /// Called to account for a load relative intrinsic.
virtual void onLoadRelativeIntrinsic() {}
284
285 /// Called to account for a lowered call.
virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
287 }
288
289 /// Account for a jump table of given size. Return false to stop further
/// processing of the switch instruction.
virtual bool onJumpTable(unsigned JumpTableSize) { return true; }
292
293 /// Account for a case cluster of given size. Return false to stop further
294 /// processing of the instruction.
virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }
296
297 /// Called at the end of processing a switch instruction, with the given
298 /// number of case clusters.
virtual void onFinalizeSwitch(unsigned JumpTableSize,
300 unsigned NumCaseCluster) {}
301
302 /// Called to account for any other instruction not specifically accounted
303 /// for.
virtual void onMissedSimplification() {}
305
306 /// Start accounting potential benefits due to SROA for the given alloca.
virtual void onInitializeSROAArg(AllocaInst *Arg) {}
308
309 /// Account SROA savings for the AllocaInst value.
virtual void onAggregateSROAUse(AllocaInst *V) {}
311
bool handleSROA(Value *V, bool DoNotDisable) {
313 // Check for SROA candidates in comparisons.
314 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
315 if (DoNotDisable) {
316 onAggregateSROAUse(SROAArg);
317 return true;
318 }
319 disableSROAForArg(SROAArg);
320 }
321 return false;
322 }
323
324 bool IsCallerRecursive = false;
325 bool IsRecursiveCall = false;
326 bool ExposesReturnsTwice = false;
327 bool HasDynamicAlloca = false;
328 bool ContainsNoDuplicateCall = false;
329 bool HasReturn = false;
330 bool HasIndirectBr = false;
331 bool HasUninlineableIntrinsic = false;
332 bool InitsVargArgs = false;
333
334 /// Number of bytes allocated statically by the callee.
335 uint64_t AllocatedSize = 0;
336 unsigned NumInstructions = 0;
337 unsigned NumVectorInstructions = 0;
338
339 /// While we walk the potentially-inlined instructions, we build up and
340 /// maintain a mapping of simplified values specific to this callsite. The
341 /// idea is to propagate any special information we have about arguments to
342 /// this call through the inlinable section of the function, and account for
343 /// likely simplifications post-inlining. The most important aspect we track
344 /// is CFG altering simplifications -- when we prove a basic block dead, that
345 /// can cause dramatic shifts in the cost of inlining a function.
346 DenseMap<Value *, Constant *> SimplifiedValues;
347
348 /// Keep track of the values which map back (through function arguments) to
349 /// allocas on the caller stack which could be simplified through SROA.
350 DenseMap<Value *, AllocaInst *> SROAArgValues;
351
352 /// Keep track of Allocas for which we believe we may get SROA optimization.
353 DenseSet<AllocaInst *> EnabledSROAAllocas;
354
355 /// Keep track of values which map to a pointer base and constant offset.
356 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
357
358 /// Keep track of dead blocks due to the constant arguments.
359 SmallPtrSet<BasicBlock *, 16> DeadBlocks;
360
361 /// The mapping of the blocks to their known unique successors due to the
362 /// constant arguments.
363 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;
364
365 /// Model the elimination of repeated loads that is expected to happen
366 /// whenever we simplify away the stores that would otherwise cause them to be
367 /// loads.
368 bool EnableLoadElimination = true;
369
370 /// Whether we allow inlining for recursive call.
371 bool AllowRecursiveCall = false;
372
373 SmallPtrSet<Value *, 16> LoadAddrSet;
374
AllocaInst *getSROAArgForValueOrNull(Value *V) const {
376 auto It = SROAArgValues.find(V);
377 if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
378 return nullptr;
379 return It->second;
380 }
381
382 // Custom simplification helper routines.
383 bool isAllocaDerivedArg(Value *V);
384 void disableSROAForArg(AllocaInst *SROAArg);
385 void disableSROA(Value *V);
386 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
387 void disableLoadElimination();
388 bool isGEPFree(GetElementPtrInst &GEP);
389 bool canFoldInboundsGEP(GetElementPtrInst &I);
390 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
391 bool simplifyCallSite(Function *F, CallBase &Call);
392 bool simplifyInstruction(Instruction &I);
393 bool simplifyIntrinsicCallIsConstant(CallBase &CB);
394 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
395
396 /// Return true if the given argument to the function being considered for
397 /// inlining has the given attribute set either at the call site or the
398 /// function declaration. Primarily used to inspect call site specific
399 /// attributes since these can be more precise than the ones on the callee
400 /// itself.
401 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);
402
403 /// Return true if the given value is known non null within the callee if
404 /// inlined through this particular callsite.
405 bool isKnownNonNullInCallee(Value *V);
406
407 /// Return true if size growth is allowed when inlining the callee at \p Call.
408 bool allowSizeGrowth(CallBase &Call);
409
410 // Custom analysis routines.
411 InlineResult analyzeBlock(BasicBlock *BB,
412 SmallPtrSetImpl<const Value *> &EphValues);
413
414 // Disable several entry points to the visitor so we don't accidentally use
415 // them by declaring but not defining them here.
416 void visit(Module *);
417 void visit(Module &);
418 void visit(Function *);
419 void visit(Function &);
420 void visit(BasicBlock *);
421 void visit(BasicBlock &);
422
423 // Provide base case for our instruction visit.
424 bool visitInstruction(Instruction &I);
425
426 // Our visit overrides.
427 bool visitAlloca(AllocaInst &I);
428 bool visitPHI(PHINode &I);
429 bool visitGetElementPtr(GetElementPtrInst &I);
430 bool visitBitCast(BitCastInst &I);
431 bool visitPtrToInt(PtrToIntInst &I);
432 bool visitIntToPtr(IntToPtrInst &I);
433 bool visitCastInst(CastInst &I);
434 bool visitCmpInst(CmpInst &I);
435 bool visitSub(BinaryOperator &I);
436 bool visitBinaryOperator(BinaryOperator &I);
437 bool visitFNeg(UnaryOperator &I);
438 bool visitLoad(LoadInst &I);
439 bool visitStore(StoreInst &I);
440 bool visitExtractValue(ExtractValueInst &I);
441 bool visitInsertValue(InsertValueInst &I);
442 bool visitCallBase(CallBase &Call);
443 bool visitReturnInst(ReturnInst &RI);
444 bool visitBranchInst(BranchInst &BI);
445 bool visitSelectInst(SelectInst &SI);
446 bool visitSwitchInst(SwitchInst &SI);
447 bool visitIndirectBrInst(IndirectBrInst &IBI);
448 bool visitResumeInst(ResumeInst &RI);
449 bool visitCleanupReturnInst(CleanupReturnInst &RI);
450 bool visitCatchReturnInst(CatchReturnInst &RI);
451 bool visitUnreachableInst(UnreachableInst &I);
452
453 public:
CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
455 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
456 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
457 ProfileSummaryInfo *PSI = nullptr,
458 OptimizationRemarkEmitter *ORE = nullptr)
459 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
460 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
461 CandidateCall(Call) {}
462
463 InlineResult analyze();
464
Optional<Constant *> getSimplifiedValue(Instruction *I) {
466 if (SimplifiedValues.find(I) != SimplifiedValues.end())
467 return SimplifiedValues[I];
468 return None;
469 }
470
471 // Keep a bunch of stats about the cost savings found so we can print them
472 // out when debugging.
473 unsigned NumConstantArgs = 0;
474 unsigned NumConstantOffsetPtrArgs = 0;
475 unsigned NumAllocaArgs = 0;
476 unsigned NumConstantPtrCmps = 0;
477 unsigned NumConstantPtrDiffs = 0;
478 unsigned NumInstructionsSimplified = 0;
479
480 void dump();
481 };
482
// When a switch is lowered to a binary search tree, the number of nodes
// equals the number of comparisons. For a given number of clusters, n, we
// can define a recursive function, f(n), for the number of nodes in the
// tree. The recursion is:
// f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
// and f(n) = n, when n <= 3.
// This leads to a binary tree whose leaves are either f(2) or f(3) when
// n > 3. So the number of comparisons from leaf nodes is n, while the
// number from non-leaf nodes is:
// 2^(log2(n) - 1) - 1
// = 2^log2(n) * 2^-1 - 1
// = n / 2 - 1.
// Combining comparisons from leaf and non-leaf nodes gives a simple closed
// form:
// n + n / 2 - 1 = n * 3 / 2 - 1
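// For example, with NumCaseCluster = 8: f(8) = 1 + f(4) + f(4), and
// f(4) = 1 + f(2) + f(2) = 5, so f(8) = 11, matching the closed form
// 3 * 8 / 2 - 1 = 11.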
int64_t getExpectedNumberOfCompare(int NumCaseCluster) {
499 return 3 * static_cast<int64_t>(NumCaseCluster) / 2 - 1;
500 }
501
502 /// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
503 /// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
504 class InlineCostCallAnalyzer final : public CallAnalyzer {
505 const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
506 const bool ComputeFullInlineCost;
507 int LoadEliminationCost = 0;
508 /// Bonus to be applied when percentage of vector instructions in callee is
509 /// high (see more details in updateThreshold).
510 int VectorBonus = 0;
511 /// Bonus to be applied when the callee has only one reachable basic block.
512 int SingleBBBonus = 0;
513
514 /// Tunable parameters that control the analysis.
515 const InlineParams &Params;
516
517 // This DenseMap stores the delta change in cost and threshold after
518 // accounting for the given instruction. The map is filled only with the
519 // flag PrintInstructionComments on.
520 DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
521
522 /// Upper bound for the inlining cost. Bonuses are being applied to account
523 /// for speculative "expected profit" of the inlining decision.
524 int Threshold = 0;
525
/// Attempt to evaluate indirect calls to boost their inline cost.
527 const bool BoostIndirectCalls;
528
529 /// Ignore the threshold when finalizing analysis.
530 const bool IgnoreThreshold;
531
532 // True if the cost-benefit-analysis-based inliner is enabled.
533 const bool CostBenefitAnalysisEnabled;
534
535 /// Inlining cost measured in abstract units, accounts for all the
536 /// instructions expected to be executed for a given function invocation.
537 /// Instructions that are statically proven to be dead based on call-site
538 /// arguments are not counted here.
539 int Cost = 0;
540
541 // The cumulative cost at the beginning of the basic block being analyzed. At
542 // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
543 // the size of that basic block.
544 int CostAtBBStart = 0;
545
546 // The static size of live but cold basic blocks. This is "static" in the
547 // sense that it's not weighted by profile counts at all.
548 int ColdSize = 0;
549
550 // Whether inlining is decided by cost-threshold analysis.
551 bool DecidedByCostThreshold = false;
552
553 // Whether inlining is decided by cost-benefit analysis.
554 bool DecidedByCostBenefit = false;
555
556 // The cost-benefit pair computed by cost-benefit analysis.
557 Optional<CostBenefitPair> CostBenefit = None;
558
559 bool SingleBB = true;
560
561 unsigned SROACostSavings = 0;
562 unsigned SROACostSavingsLost = 0;
563
564 /// The mapping of caller Alloca values to their accumulated cost savings. If
565 /// we have to disable SROA for one of the allocas, this tells us how much
566 /// cost must be added.
567 DenseMap<AllocaInst *, int> SROAArgCosts;
568
569 /// Return true if \p Call is a cold callsite.
570 bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);
571
572 /// Update Threshold based on callsite properties such as callee
573 /// attributes and callee hotness for PGO builds. The Callee is explicitly
574 /// passed to support analyzing indirect calls whose target is inferred by
575 /// analysis.
576 void updateThreshold(CallBase &Call, Function &Callee);
577 /// Return a higher threshold if \p Call is a hot callsite.
578 Optional<int> getHotCallSiteThreshold(CallBase &Call,
579 BlockFrequencyInfo *CallerBFI);
580
581 /// Handle a capped 'int' increment for Cost.
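/// For example, addCost(10, /*UpperBound=*/50) with Cost == 45 clamps Cost
/// to 50 rather than setting it to 55.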
void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
583 assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
584 Cost = std::min<int>(UpperBound, Cost + Inc);
585 }
586
void onDisableSROA(AllocaInst *Arg) override {
588 auto CostIt = SROAArgCosts.find(Arg);
589 if (CostIt == SROAArgCosts.end())
590 return;
591 addCost(CostIt->second);
592 SROACostSavings -= CostIt->second;
593 SROACostSavingsLost += CostIt->second;
594 SROAArgCosts.erase(CostIt);
595 }
596
void onDisableLoadElimination() override {
598 addCost(LoadEliminationCost);
599 LoadEliminationCost = 0;
600 }
601
bool onCallBaseVisitStart(CallBase &Call) override {
603 if (Optional<int> AttrCallThresholdBonus =
604 getStringFnAttrAsInt(Call, "call-threshold-bonus"))
605 Threshold += *AttrCallThresholdBonus;
606
607 if (Optional<int> AttrCallCost =
608 getStringFnAttrAsInt(Call, "call-inline-cost")) {
609 addCost(*AttrCallCost);
610 // Prevent further processing of the call since we want to override its
611 // inline cost, not just add to it.
612 return false;
613 }
614 return true;
615 }
616
void onCallPenalty() override { addCost(CallPenalty); }
void onCallArgumentSetup(const CallBase &Call) override {
619 // Pay the price of the argument setup. We account for the average 1
620 // instruction per call argument setup here.
621 addCost(Call.arg_size() * InlineConstants::InstrCost);
622 }
void onLoadRelativeIntrinsic() override {
624 // This is normally lowered to 4 LLVM instructions.
625 addCost(3 * InlineConstants::InstrCost);
626 }
void onLoweredCall(Function *F, CallBase &Call,
628 bool IsIndirectCall) override {
629 // We account for the average 1 instruction per call argument setup here.
630 addCost(Call.arg_size() * InlineConstants::InstrCost);
631
632 // If we have a constant that we are calling as a function, we can peer
633 // through it and see the function target. This happens not infrequently
634 // during devirtualization and so we want to give it a hefty bonus for
635 // inlining, but cap that bonus in the event that inlining wouldn't pan out.
636 // Pretend to inline the function, with a custom threshold.
637 if (IsIndirectCall && BoostIndirectCalls) {
638 auto IndirectCallParams = Params;
639 IndirectCallParams.DefaultThreshold =
640 InlineConstants::IndirectCallThreshold;
641 /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
642 /// to instantiate the derived class.
643 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
644 GetAssumptionCache, GetBFI, PSI, ORE, false);
645 if (CA.analyze().isSuccess()) {
646 // We were able to inline the indirect call! Subtract the cost from the
647 // threshold to get the bonus we want to apply, but don't go below zero.
648 Cost -= std::max(0, CA.getThreshold() - CA.getCost());
649 }
650 } else
651 // Otherwise simply add the cost for merely making the call.
652 addCost(CallPenalty);
653 }
654
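// For example, a switch lowered to a jump table with 16 entries adds
// (16 + 4) * InstrCost below, while one lowered to a binary search over
// 8 case clusters adds (3 * 8 / 2 - 1) * 2 * InstrCost = 22 * InstrCost.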
void onFinalizeSwitch(unsigned JumpTableSize,
656 unsigned NumCaseCluster) override {
657 // If suitable for a jump table, consider the cost for the table size and
658 // branch to destination.
659 // Maximum valid cost increased in this function.
660 if (JumpTableSize) {
661 int64_t JTCost =
662 static_cast<int64_t>(JumpTableSize) * InlineConstants::InstrCost +
663 4 * InlineConstants::InstrCost;
664
665 addCost(JTCost, static_cast<int64_t>(CostUpperBound));
666 return;
667 }
668
669 if (NumCaseCluster <= 3) {
670 // Suppose a comparison includes one compare and one conditional branch.
671 addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
672 return;
673 }
674
675 int64_t ExpectedNumberOfCompare =
676 getExpectedNumberOfCompare(NumCaseCluster);
677 int64_t SwitchCost =
678 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
679
680 addCost(SwitchCost, static_cast<int64_t>(CostUpperBound));
681 }
void onMissedSimplification() override {
683 addCost(InlineConstants::InstrCost);
684 }
685
void onInitializeSROAArg(AllocaInst *Arg) override {
687 assert(Arg != nullptr &&
688 "Should not initialize SROA costs for null value.");
689 SROAArgCosts[Arg] = 0;
690 }
691
void onAggregateSROAUse(AllocaInst *SROAArg) override {
693 auto CostIt = SROAArgCosts.find(SROAArg);
694 assert(CostIt != SROAArgCosts.end() &&
695 "expected this argument to have a cost");
696 CostIt->second += InlineConstants::InstrCost;
697 SROACostSavings += InlineConstants::InstrCost;
698 }
699
void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
701
void onBlockAnalyzed(const BasicBlock *BB) override {
703 if (CostBenefitAnalysisEnabled) {
704 // Keep track of the static size of live but cold basic blocks. For now,
705 // we define a cold basic block to be one that's never executed.
706 assert(GetBFI && "GetBFI must be available");
707 BlockFrequencyInfo *BFI = &(GetBFI(F));
708 assert(BFI && "BFI must be available");
709 auto ProfileCount = BFI->getBlockProfileCount(BB);
710 assert(ProfileCount);
711 if (ProfileCount.value() == 0)
712 ColdSize += Cost - CostAtBBStart;
713 }
714
715 auto *TI = BB->getTerminator();
// If we had any successors at this point, then post-inlining is likely to
717 // have them as well. Note that we assume any basic blocks which existed
718 // due to branches or switches which folded above will also fold after
719 // inlining.
720 if (SingleBB && TI->getNumSuccessors() > 1) {
721 // Take off the bonus we applied to the threshold.
722 Threshold -= SingleBBBonus;
723 SingleBB = false;
724 }
725 }
726
void onInstructionAnalysisStart(const Instruction *I) override {
728 // This function is called to store the initial cost of inlining before
729 // the given instruction was assessed.
730 if (!PrintInstructionComments)
731 return;
732 InstructionCostDetailMap[I].CostBefore = Cost;
733 InstructionCostDetailMap[I].ThresholdBefore = Threshold;
734 }
735
void onInstructionAnalysisFinish(const Instruction *I) override {
737 // This function is called to find new values of cost and threshold after
738 // the instruction has been assessed.
739 if (!PrintInstructionComments)
740 return;
741 InstructionCostDetailMap[I].CostAfter = Cost;
742 InstructionCostDetailMap[I].ThresholdAfter = Threshold;
743 }
744
bool isCostBenefitAnalysisEnabled() {
746 if (!PSI || !PSI->hasProfileSummary())
747 return false;
748
749 if (!GetBFI)
750 return false;
751
752 if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
753 // Honor the explicit request from the user.
754 if (!InlineEnableCostBenefitAnalysis)
755 return false;
756 } else {
757 // Otherwise, require instrumentation profile.
758 if (!PSI->hasInstrumentationProfile())
759 return false;
760 }
761
762 auto *Caller = CandidateCall.getParent()->getParent();
763 if (!Caller->getEntryCount())
764 return false;
765
766 BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
767 if (!CallerBFI)
768 return false;
769
770 // For now, limit to hot call site.
771 if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
772 return false;
773
774 // Make sure we have a nonzero entry count.
775 auto EntryCount = F.getEntryCount();
776 if (!EntryCount || !EntryCount->getCount())
777 return false;
778
779 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
780 if (!CalleeBFI)
781 return false;
782
783 return true;
784 }
785
786 // Determine whether we should inline the given call site, taking into account
787 // both the size cost and the cycle savings. Return None if we don't have
// sufficient profiling information to determine.
Optional<bool> costBenefitAnalysis() {
790 if (!CostBenefitAnalysisEnabled)
791 return None;
792
793 // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
794 // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
795 // falling back to the cost-based metric.
796 // TODO: Improve this hacky condition.
797 if (Threshold == 0)
798 return None;
799
800 assert(GetBFI);
801 BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
802 assert(CalleeBFI);
803
804 // The cycle savings expressed as the sum of InlineConstants::InstrCost
805 // multiplied by the estimated dynamic count of each instruction we can
806 // avoid. Savings come from the call site cost, such as argument setup and
807 // the call instruction, as well as the instructions that are folded.
808 //
809 // We use 128-bit APInt here to avoid potential overflow. This variable
// should stay well below 10^24 (or 2^80) in practice. This "worst" case
// assumes that we can avoid or fold a billion instructions, each with a
// profile count of 10^15 -- roughly the number of cycles for a 24-hour
813 // period on a 4GHz machine.
814 APInt CycleSavings(128, 0);
815
816 for (auto &BB : F) {
817 APInt CurrentSavings(128, 0);
818 for (auto &I : BB) {
819 if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
820 // Count a conditional branch as savings if it becomes unconditional.
821 if (BI->isConditional() &&
822 isa_and_nonnull<ConstantInt>(
823 SimplifiedValues.lookup(BI->getCondition()))) {
824 CurrentSavings += InlineConstants::InstrCost;
825 }
826 } else if (Value *V = dyn_cast<Value>(&I)) {
827 // Count an instruction as savings if we can fold it.
828 if (SimplifiedValues.count(V)) {
829 CurrentSavings += InlineConstants::InstrCost;
830 }
831 }
832 }
833
834 auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
835 assert(ProfileCount);
836 CurrentSavings *= ProfileCount.value();
837 CycleSavings += CurrentSavings;
838 }
839
840 // Compute the cycle savings per call.
841 auto EntryProfileCount = F.getEntryCount();
842 assert(EntryProfileCount && EntryProfileCount->getCount());
843 auto EntryCount = EntryProfileCount->getCount();
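// Adding EntryCount / 2 before the unsigned division rounds the per-call
// savings to the nearest integer instead of truncating.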
844 CycleSavings += EntryCount / 2;
845 CycleSavings = CycleSavings.udiv(EntryCount);
846
847 // Compute the total savings for the call site.
848 auto *CallerBB = CandidateCall.getParent();
849 BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
850 CycleSavings += getCallsiteCost(this->CandidateCall, DL);
851 CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
852
853 // Remove the cost of the cold basic blocks.
854 int Size = Cost - ColdSize;
855
856 // Allow tiny callees to be inlined regardless of whether they meet the
857 // savings threshold.
858 Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
859
860 CostBenefit.emplace(APInt(128, Size), CycleSavings);
861
862 // Return true if the savings justify the cost of inlining. Specifically,
863 // we evaluate the following inequality:
864 //
865 // CycleSavings PSI->getOrCompHotCountThreshold()
866 // -------------- >= -----------------------------------
867 // Size InlineSavingsMultiplier
868 //
869 // Note that the left hand side is specific to a call site. The right hand
870 // side is a constant for the entire executable.
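// For example, with the default InlineSavingsMultiplier of 8 and a hot
// count threshold of, say, 16000, a call site of Size 100 is inlined only
// if its estimated cycle savings are at least 100 * 16000 / 8 = 200000
// (illustrative numbers; the real threshold comes from the profile summary).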
871 APInt LHS = CycleSavings;
872 LHS *= InlineSavingsMultiplier;
873 APInt RHS(128, PSI->getOrCompHotCountThreshold());
874 RHS *= Size;
875 return LHS.uge(RHS);
876 }
877
InlineResult finalizeAnalysis() override {
879 // Loops generally act a lot like calls in that they act like barriers to
880 // movement, require a certain amount of setup, etc. So when optimising for
881 // size, we penalise any call sites that perform loops. We do this after all
882 // other costs here, so will likely only be dealing with relatively small
883 // functions (and hence DT and LI will hopefully be cheap).
884 auto *Caller = CandidateCall.getFunction();
885 if (Caller->hasMinSize()) {
886 DominatorTree DT(F);
887 LoopInfo LI(DT);
888 int NumLoops = 0;
889 for (Loop *L : LI) {
890 // Ignore loops that will not be executed
891 if (DeadBlocks.count(L->getHeader()))
892 continue;
893 NumLoops++;
894 }
895 addCost(NumLoops * InlineConstants::LoopPenalty);
896 }
897
898 // We applied the maximum possible vector bonus at the beginning. Now,
899 // subtract the excess bonus, if any, from the Threshold before
900 // comparing against Cost.
901 if (NumVectorInstructions <= NumInstructions / 10)
902 Threshold -= VectorBonus;
903 else if (NumVectorInstructions <= NumInstructions / 2)
904 Threshold -= VectorBonus / 2;
905
906 if (Optional<int> AttrCost =
907 getStringFnAttrAsInt(CandidateCall, "function-inline-cost"))
908 Cost = *AttrCost;
909
910 if (Optional<int> AttrCostMult = getStringFnAttrAsInt(
911 CandidateCall,
912 InlineConstants::FunctionInlineCostMultiplierAttributeName))
913 Cost *= *AttrCostMult;
914
915 if (Optional<int> AttrThreshold =
916 getStringFnAttrAsInt(CandidateCall, "function-inline-threshold"))
917 Threshold = *AttrThreshold;
918
919 if (auto Result = costBenefitAnalysis()) {
920 DecidedByCostBenefit = true;
921 if (*Result)
922 return InlineResult::success();
923 else
924 return InlineResult::failure("Cost over threshold.");
925 }
926
927 if (IgnoreThreshold)
928 return InlineResult::success();
929
930 DecidedByCostThreshold = true;
931 return Cost < std::max(1, Threshold)
932 ? InlineResult::success()
933 : InlineResult::failure("Cost over threshold.");
934 }
935
bool shouldStop() override {
937 if (IgnoreThreshold || ComputeFullInlineCost)
938 return false;
939 // Bail out the moment we cross the threshold. This means we'll under-count
940 // the cost, but only when undercounting doesn't matter.
941 if (Cost < Threshold)
942 return false;
943 DecidedByCostThreshold = true;
944 return true;
945 }
946
void onLoadEliminationOpportunity() override {
948 LoadEliminationCost += InlineConstants::InstrCost;
949 }
950
InlineResult onAnalysisStart() override {
952 // Perform some tweaks to the cost and threshold based on the direct
953 // callsite information.
954
955 // We want to more aggressively inline vector-dense kernels, so up the
956 // threshold, and we'll lower it if the % of vector instructions gets too
// low. Note that these bonuses are somewhat arbitrary and evolved over
958 // time by accident as much as because they are principled bonuses.
959 //
960 // FIXME: It would be nice to remove all such bonuses. At least it would be
961 // nice to base the bonus values on something more scientific.
962 assert(NumInstructions == 0);
963 assert(NumVectorInstructions == 0);
964
965 // Update the threshold based on callsite properties
966 updateThreshold(CandidateCall, F);
967
968 // While Threshold depends on commandline options that can take negative
969 // values, we want to enforce the invariant that the computed threshold and
970 // bonuses are non-negative.
971 assert(Threshold >= 0);
972 assert(SingleBBBonus >= 0);
973 assert(VectorBonus >= 0);
974
975 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
976 // this Threshold any time, and cost cannot decrease, we can stop processing
977 // the rest of the function body.
978 Threshold += (SingleBBBonus + VectorBonus);
979
980 // Give out bonuses for the callsite, as the instructions setting them up
981 // will be gone after inlining.
982 addCost(-getCallsiteCost(this->CandidateCall, DL));
983
984 // If this function uses the coldcc calling convention, prefer not to inline
985 // it.
986 if (F.getCallingConv() == CallingConv::Cold)
987 Cost += InlineConstants::ColdccPenalty;
988
989 LLVM_DEBUG(dbgs() << " Initial cost: " << Cost << "\n");
990
991 // Check if we're done. This can happen due to bonuses and penalties.
992 if (Cost >= Threshold && !ComputeFullInlineCost)
993 return InlineResult::failure("high cost");
994
995 return InlineResult::success();
996 }
997
998 public:
InlineCostCallAnalyzer(
1000 Function &Callee, CallBase &Call, const InlineParams &Params,
1001 const TargetTransformInfo &TTI,
1002 function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
1003 function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
1004 ProfileSummaryInfo *PSI = nullptr,
1005 OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
1006 bool IgnoreThreshold = false)
1007 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
1008 ComputeFullInlineCost(OptComputeFullInlineCost ||
1009 Params.ComputeFullInlineCost || ORE ||
1010 isCostBenefitAnalysisEnabled()),
1011 Params(Params), Threshold(Params.DefaultThreshold),
1012 BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
1013 CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
1014 Writer(this) {
1015 AllowRecursiveCall = *Params.AllowRecursiveCall;
1016 }
1017
1018 /// Annotation Writer for instruction details
1019 InlineCostAnnotationWriter Writer;
1020
1021 void dump();
1022
1023 // Prints the same analysis as dump(), but its definition is not dependent
1024 // on the build.
1025 void print(raw_ostream &OS);
1026
Optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
1028 if (InstructionCostDetailMap.find(I) != InstructionCostDetailMap.end())
1029 return InstructionCostDetailMap[I];
1030 return None;
1031 }
1032
1033 virtual ~InlineCostCallAnalyzer() = default;
int getThreshold() const { return Threshold; }
int getCost() const { return Cost; }
Optional<CostBenefitPair> getCostBenefitPair() { return CostBenefit; }
bool wasDecidedByCostBenefit() const { return DecidedByCostBenefit; }
bool wasDecidedByCostThreshold() const { return DecidedByCostThreshold; }
1039 };
1040
1041 class InlineCostFeaturesAnalyzer final : public CallAnalyzer {
1042 private:
1043 InlineCostFeatures Cost = {};
1044
1045 // FIXME: These constants are taken from the heuristic-based cost visitor.
1046 // These should be removed entirely in a later revision to avoid reliance on
1047 // heuristics in the ML inliner.
1048 static constexpr int JTCostMultiplier = 4;
1049 static constexpr int CaseClusterCostMultiplier = 2;
1050 static constexpr int SwitchCostMultiplier = 2;
1051
1052 // FIXME: These are taken from the heuristic-based cost visitor: we should
1053 // eventually abstract these to the CallAnalyzer to avoid duplication.
1054 unsigned SROACostSavingOpportunities = 0;
1055 int VectorBonus = 0;
1056 int SingleBBBonus = 0;
1057 int Threshold = 5;
1058
1059 DenseMap<AllocaInst *, unsigned> SROACosts;
1060
void increment(InlineCostFeatureIndex Feature, int64_t Delta = 1) {
1062 Cost[static_cast<size_t>(Feature)] += Delta;
1063 }
1064
void set(InlineCostFeatureIndex Feature, int64_t Value) {
1066 Cost[static_cast<size_t>(Feature)] = Value;
1067 }
1068
void onDisableSROA(AllocaInst *Arg) override {
1070 auto CostIt = SROACosts.find(Arg);
1071 if (CostIt == SROACosts.end())
1072 return;
1073
1074 increment(InlineCostFeatureIndex::SROALosses, CostIt->second);
1075 SROACostSavingOpportunities -= CostIt->second;
1076 SROACosts.erase(CostIt);
1077 }
1078
void onDisableLoadElimination() override {
1080 set(InlineCostFeatureIndex::LoadElimination, 1);
1081 }
1082
void onCallPenalty() override {
1084 increment(InlineCostFeatureIndex::CallPenalty, CallPenalty);
1085 }
1086
void onCallArgumentSetup(const CallBase &Call) override {
1088 increment(InlineCostFeatureIndex::CallArgumentSetup,
1089 Call.arg_size() * InlineConstants::InstrCost);
1090 }
1091
void onLoadRelativeIntrinsic() override {
1093 increment(InlineCostFeatureIndex::LoadRelativeIntrinsic,
1094 3 * InlineConstants::InstrCost);
1095 }
1096
void onLoweredCall(Function *F, CallBase &Call,
1098 bool IsIndirectCall) override {
1099 increment(InlineCostFeatureIndex::LoweredCallArgSetup,
1100 Call.arg_size() * InlineConstants::InstrCost);
1101
1102 if (IsIndirectCall) {
1103 InlineParams IndirectCallParams = {/* DefaultThreshold*/ 0,
1104 /*HintThreshold*/ {},
1105 /*ColdThreshold*/ {},
1106 /*OptSizeThreshold*/ {},
1107 /*OptMinSizeThreshold*/ {},
1108 /*HotCallSiteThreshold*/ {},
1109 /*LocallyHotCallSiteThreshold*/ {},
1110 /*ColdCallSiteThreshold*/ {},
1111 /*ComputeFullInlineCost*/ true,
1112 /*EnableDeferral*/ true};
1113 IndirectCallParams.DefaultThreshold =
1114 InlineConstants::IndirectCallThreshold;
1115
1116 InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
1117 GetAssumptionCache, GetBFI, PSI, ORE, false,
1118 true);
1119 if (CA.analyze().isSuccess()) {
1120 increment(InlineCostFeatureIndex::NestedInlineCostEstimate,
1121 CA.getCost());
1122 increment(InlineCostFeatureIndex::NestedInlines, 1);
1123 }
1124 } else {
1125 onCallPenalty();
1126 }
1127 }
1128
void onFinalizeSwitch(unsigned JumpTableSize,
1130 unsigned NumCaseCluster) override {
1131
1132 if (JumpTableSize) {
1133 int64_t JTCost =
1134 static_cast<int64_t>(JumpTableSize) * InlineConstants::InstrCost +
1135 JTCostMultiplier * InlineConstants::InstrCost;
1136 increment(InlineCostFeatureIndex::JumpTablePenalty, JTCost);
1137 return;
1138 }
1139
1140 if (NumCaseCluster <= 3) {
1141 increment(InlineCostFeatureIndex::CaseClusterPenalty,
1142 NumCaseCluster * CaseClusterCostMultiplier *
1143 InlineConstants::InstrCost);
1144 return;
1145 }
1146
1147 int64_t ExpectedNumberOfCompare =
1148 getExpectedNumberOfCompare(NumCaseCluster);
1149
1150 int64_t SwitchCost = ExpectedNumberOfCompare * SwitchCostMultiplier *
1151 InlineConstants::InstrCost;
1152 increment(InlineCostFeatureIndex::SwitchPenalty, SwitchCost);
1153 }
1154
void onMissedSimplification() override {
1156 increment(InlineCostFeatureIndex::UnsimplifiedCommonInstructions,
1157 InlineConstants::InstrCost);
1158 }
1159
void onInitializeSROAArg(AllocaInst *Arg) override { SROACosts[Arg] = 0; }
void onAggregateSROAUse(AllocaInst *Arg) override {
1162 SROACosts.find(Arg)->second += InlineConstants::InstrCost;
1163 SROACostSavingOpportunities += InlineConstants::InstrCost;
1164 }
1165
void onBlockAnalyzed(const BasicBlock *BB) override {
1167 if (BB->getTerminator()->getNumSuccessors() > 1)
1168 set(InlineCostFeatureIndex::IsMultipleBlocks, 1);
1169 Threshold -= SingleBBBonus;
1170 }
1171
InlineResult finalizeAnalysis() override {
1173 auto *Caller = CandidateCall.getFunction();
1174 if (Caller->hasMinSize()) {
1175 DominatorTree DT(F);
1176 LoopInfo LI(DT);
1177 for (Loop *L : LI) {
1178 // Ignore loops that will not be executed
1179 if (DeadBlocks.count(L->getHeader()))
1180 continue;
1181 increment(InlineCostFeatureIndex::NumLoops,
1182 InlineConstants::LoopPenalty);
1183 }
1184 }
1185 set(InlineCostFeatureIndex::DeadBlocks, DeadBlocks.size());
1186 set(InlineCostFeatureIndex::SimplifiedInstructions,
1187 NumInstructionsSimplified);
1188 set(InlineCostFeatureIndex::ConstantArgs, NumConstantArgs);
1189 set(InlineCostFeatureIndex::ConstantOffsetPtrArgs,
1190 NumConstantOffsetPtrArgs);
1191 set(InlineCostFeatureIndex::SROASavings, SROACostSavingOpportunities);
1192
1193 if (NumVectorInstructions <= NumInstructions / 10)
1194 Threshold -= VectorBonus;
1195 else if (NumVectorInstructions <= NumInstructions / 2)
1196 Threshold -= VectorBonus / 2;
1197
1198 set(InlineCostFeatureIndex::Threshold, Threshold);
1199
1200 return InlineResult::success();
1201 }
1202
bool shouldStop() override { return false; }
1204
void onLoadEliminationOpportunity() override {
1206 increment(InlineCostFeatureIndex::LoadElimination, 1);
1207 }
1208
InlineResult onAnalysisStart() override {
1210 increment(InlineCostFeatureIndex::CallSiteCost,
1211 -1 * getCallsiteCost(this->CandidateCall, DL));
1212
1213 set(InlineCostFeatureIndex::ColdCcPenalty,
1214 (F.getCallingConv() == CallingConv::Cold));
1215
1216 set(InlineCostFeatureIndex::LastCallToStaticBonus,
1217 (F.hasLocalLinkage() && F.hasOneLiveUse() &&
1218 &F == CandidateCall.getCalledFunction()));
1219
1220 // FIXME: we shouldn't repeat this logic in both the Features and Cost
1221 // analyzer - instead, we should abstract it to a common method in the
1222 // CallAnalyzer
1223 int SingleBBBonusPercent = 50;
1224 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1225 Threshold += TTI.adjustInliningThreshold(&CandidateCall);
1226 Threshold *= TTI.getInliningThresholdMultiplier();
1227 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1228 VectorBonus = Threshold * VectorBonusPercent / 100;
1229 Threshold += (SingleBBBonus + VectorBonus);
1230
1231 return InlineResult::success();
1232 }
1233
1234 public:
InlineCostFeaturesAnalyzer(
1236 const TargetTransformInfo &TTI,
1237 function_ref<AssumptionCache &(Function &)> &GetAssumptionCache,
1238 function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
1239 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
1240 CallBase &Call)
1241 : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI) {}
1242
const InlineCostFeatures &features() const { return Cost; }
1244 };
1245
1246 } // namespace
1247
1248 /// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
1250 return SROAArgValues.count(V);
1251 }
1252
void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
1254 onDisableSROA(SROAArg);
1255 EnabledSROAAllocas.erase(SROAArg);
1256 disableLoadElimination();
1257 }
1258
void InlineCostAnnotationWriter::emitInstructionAnnot(
1260 const Instruction *I, formatted_raw_ostream &OS) {
// The cost of inlining the given instruction is always printed.
// The threshold delta is printed only when it is non-zero, which happens
// when we decide to give a bonus at a particular instruction.
1264 Optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
1265 if (!Record)
1266 OS << "; No analysis for the instruction";
1267 else {
1268 OS << "; cost before = " << Record->CostBefore
1269 << ", cost after = " << Record->CostAfter
1270 << ", threshold before = " << Record->ThresholdBefore
1271 << ", threshold after = " << Record->ThresholdAfter << ", ";
1272 OS << "cost delta = " << Record->getCostDelta();
1273 if (Record->hasThresholdChanged())
1274 OS << ", threshold delta = " << Record->getThresholdDelta();
1275 }
1276 auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
1277 if (C) {
1278 OS << ", simplified to ";
1279 (*C)->print(OS, true);
1280 }
1281 OS << "\n";
1282 }
1283
1284 /// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
1286 if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
1287 disableSROAForArg(SROAArg);
1288 }
1289 }
1290
void CallAnalyzer::disableLoadElimination() {
1292 if (EnableLoadElimination) {
1293 onDisableLoadElimination();
1294 EnableLoadElimination = false;
1295 }
1296 }
1297
1298 /// Accumulate a constant GEP offset into an APInt if possible.
1299 ///
1300 /// Returns false if unable to compute the offset for any reason. Respects any
1301 /// simplified values known during the analysis of this callsite.
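/// For example, assuming a typical 64-bit DataLayout where the struct
/// { i32, i64 } is laid out as { 0, 8 } with a total size of 16 bytes,
/// `getelementptr { i32, i64 }, ptr %p, i64 1, i32 1` accumulates an
/// offset of 16 + 8 = 24 bytes.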
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
1303 unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
1304 assert(IntPtrWidth == Offset.getBitWidth());
1305
1306 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
1307 GTI != GTE; ++GTI) {
1308 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
1309 if (!OpC)
1310 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
1311 OpC = dyn_cast<ConstantInt>(SimpleOp);
1312 if (!OpC)
1313 return false;
1314 if (OpC->isZero())
1315 continue;
1316
1317 // Handle a struct index, which adds its field offset to the pointer.
1318 if (StructType *STy = GTI.getStructTypeOrNull()) {
1319 unsigned ElementIdx = OpC->getZExtValue();
1320 const StructLayout *SL = DL.getStructLayout(STy);
1321 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
1322 continue;
1323 }
1324
1325 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
1326 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
1327 }
1328 return true;
1329 }
1330
1331 /// Use TTI to check whether a GEP is free.
1332 ///
1333 /// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
1335 SmallVector<Value *, 4> Operands;
1336 Operands.push_back(GEP.getOperand(0));
1337 for (const Use &Op : GEP.indices())
1338 if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
1339 Operands.push_back(SimpleOp);
1340 else
1341 Operands.push_back(Op);
1342 return TTI.getUserCost(&GEP, Operands,
1343 TargetTransformInfo::TCK_SizeAndLatency) ==
1344 TargetTransformInfo::TCC_Free;
1345 }
1346
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
1348 disableSROA(I.getOperand(0));
1349
1350 // Check whether inlining will turn a dynamic alloca into a static
1351 // alloca and handle that case.
1352 if (I.isArrayAllocation()) {
1353 Constant *Size = SimplifiedValues.lookup(I.getArraySize());
1354 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
1355 // Sometimes a dynamic alloca could be converted into a static alloca
1356 // after this constant prop, and become a huge static alloca on an
1357 // unconditional CFG path. Avoid inlining if this is going to happen above
1358 // a threshold.
1359 // FIXME: If the threshold is removed or lowered too much, we could end up
1360 // being too pessimistic and prevent inlining non-problematic code. This
1361 // could result in unintended perf regressions. A better overall strategy
1362 // is needed to track stack usage during inlining.
1363 Type *Ty = I.getAllocatedType();
1364 AllocatedSize = SaturatingMultiplyAdd(
1365 AllocSize->getLimitedValue(),
1366 DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
1367 if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
1368 HasDynamicAlloca = true;
1369 return false;
1370 }
1371 }
1372
1373 // Accumulate the allocated size.
1374 if (I.isStaticAlloca()) {
1375 Type *Ty = I.getAllocatedType();
1376 AllocatedSize =
1377 SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
1378 }
1379
1380 // FIXME: This is overly conservative. Dynamic allocas are inefficient for
1381 // a variety of reasons, and so we would like to not inline them into
1382 // functions which don't currently have a dynamic alloca. This simply
1383 // disables inlining altogether in the presence of a dynamic alloca.
1384 if (!I.isStaticAlloca())
1385 HasDynamicAlloca = true;
1386
1387 return false;
1388 }
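// For illustration, assuming the following IR:
//   %buf = alloca i32, i32 %n
// If %n has been simplified to the constant 1000 at this call site, the
// alloca is costed as static and AllocatedSize grows by 1000 * 4 = 4000
// bytes; crossing MaxSimplifiedDynamicAllocaToInline sets HasDynamicAlloca
// and blocks inlining.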
1389
1390 bool CallAnalyzer::visitPHI(PHINode &I) {
1391 // FIXME: We need to propagate SROA *disabling* through phi nodes, even
1392 // though we don't want to propagate its bonuses. The idea is to disable
1393 // SROA if it *might* be used in an inappropriate manner.
1394
1395 // Phi nodes are always zero-cost.
1396 // FIXME: Pointer sizes may differ between different address spaces, so do we
1397 // need to use the correct address space in the call to getPointerSizeInBits here?
1398 // Or could we skip the getPointerSizeInBits call completely? As far as I can
1399 // see the ZeroOffset is used as a dummy value, so we can probably use any
1400 // bit width for the ZeroOffset?
1401 APInt ZeroOffset = APInt::getZero(DL.getPointerSizeInBits(0));
1402 bool CheckSROA = I.getType()->isPointerTy();
1403
1404 // Track the constant or pointer with constant offset we've seen so far.
1405 Constant *FirstC = nullptr;
1406 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
1407 Value *FirstV = nullptr;
1408
1409 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
1410 BasicBlock *Pred = I.getIncomingBlock(i);
1411 // If the incoming block is dead, skip the incoming block.
1412 if (DeadBlocks.count(Pred))
1413 continue;
1414 // If the parent block of the phi is not the known successor of the incoming
1415 // block, skip the incoming block.
1416 BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
1417 if (KnownSuccessor && KnownSuccessor != I.getParent())
1418 continue;
1419
1420 Value *V = I.getIncomingValue(i);
1421 // If the incoming value is this phi itself, skip the incoming value.
1422 if (&I == V)
1423 continue;
1424
1425 Constant *C = dyn_cast<Constant>(V);
1426 if (!C)
1427 C = SimplifiedValues.lookup(V);
1428
1429 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
1430 if (!C && CheckSROA)
1431 BaseAndOffset = ConstantOffsetPtrs.lookup(V);
1432
1433 if (!C && !BaseAndOffset.first)
1434 // The incoming value is neither a constant nor a pointer with constant
1435 // offset, exit early.
1436 return true;
1437
1438 if (FirstC) {
1439 if (FirstC == C)
1440 // If we've seen a constant incoming value before and it is the same
1441 // constant we see this time, continue checking the next incoming value.
1442 continue;
1443 // Otherwise exit early: either we see a different constant this time, or
1444 // we saw a constant before but now have a pointer with a constant offset.
1445 return true;
1446 }
1447
1448 if (FirstV) {
1449 // The same logic as above, but check pointer with constant offset here.
1450 if (FirstBaseAndOffset == BaseAndOffset)
1451 continue;
1452 return true;
1453 }
1454
1455 if (C) {
1456 // This is the 1st time we've seen a constant, record it.
1457 FirstC = C;
1458 continue;
1459 }
1460
1461 // The remaining case is that this is the 1st time we've seen a pointer with
1462 // constant offset, record it.
1463 FirstV = V;
1464 FirstBaseAndOffset = BaseAndOffset;
1465 }
1466
1467 // Check if we can map phi to a constant.
1468 if (FirstC) {
1469 SimplifiedValues[&I] = FirstC;
1470 return true;
1471 }
1472
1473 // Check if we can map phi to a pointer with constant offset.
1474 if (FirstBaseAndOffset.first) {
1475 ConstantOffsetPtrs[&I] = FirstBaseAndOffset;
1476
1477 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
1478 SROAArgValues[&I] = SROAArg;
1479 }
1480
1481 return true;
1482 }
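// For illustration, assuming the following IR:
//   %x = phi i32 [ 7, %bb.a ], [ 7, %bb.b ]
// Both live incoming values map to the same constant, so the phi is recorded
// in SimplifiedValues as i32 7. If instead both incoming values shared the
// same base pointer and constant offset, the phi would be recorded in
// ConstantOffsetPtrs (and SROAArgValues) rather than folded to a constant.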
1483
1484 /// Check we can fold GEPs of constant-offset call site argument pointers.
1485 /// This requires target data and inbounds GEPs.
1486 ///
1487 /// \return true if the specified GEP can be folded.
1488 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
1489 // Check if we have a base + offset for the pointer.
1490 std::pair<Value *, APInt> BaseAndOffset =
1491 ConstantOffsetPtrs.lookup(I.getPointerOperand());
1492 if (!BaseAndOffset.first)
1493 return false;
1494
1495 // Check if the offset of this GEP is constant, and if so accumulate it
1496 // into Offset.
1497 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
1498 return false;
1499
1500 // Add the result as a new mapping to Base + Offset.
1501 ConstantOffsetPtrs[&I] = BaseAndOffset;
1502
1503 return true;
1504 }
1505
1506 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
1507 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());
1508
1509 // Lambda to check whether a GEP's indices are all constant.
1510 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
1511 for (const Use &Op : GEP.indices())
1512 if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
1513 return false;
1514 return true;
1515 };
1516
1517 if (!DisableGEPConstOperand)
1518 if (simplifyInstruction(I))
1519 return true;
1520
1521 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
1522 if (SROAArg)
1523 SROAArgValues[&I] = SROAArg;
1524
1525 // Constant GEPs are modeled as free.
1526 return true;
1527 }
1528
1529 // Variable GEPs will require math and will disable SROA.
1530 if (SROAArg)
1531 disableSROAForArg(SROAArg);
1532 return isGEPFree(I);
1533 }
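// For illustration, assuming the following IR:
//   %f = getelementptr inbounds i32, i32* %p, i64 3
// All indices are constant, so the GEP is modeled as free and %p's SROA
// candidacy is preserved; a GEP with a variable index would instead disable
// SROA for the argument and be costed through TTI.getUserCost.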
1534
1535 /// Simplify \p I if its operands are constants and update SimplifiedValues.
1536 bool CallAnalyzer::simplifyInstruction(Instruction &I) {
1537 SmallVector<Constant *> COps;
1538 for (Value *Op : I.operands()) {
1539 Constant *COp = dyn_cast<Constant>(Op);
1540 if (!COp)
1541 COp = SimplifiedValues.lookup(Op);
1542 if (!COp)
1543 return false;
1544 COps.push_back(COp);
1545 }
1546 auto *C = ConstantFoldInstOperands(&I, COps, DL);
1547 if (!C)
1548 return false;
1549 SimplifiedValues[&I] = C;
1550 return true;
1551 }
1552
1553 /// Try to simplify a call to llvm.is.constant.
1554 ///
1555 /// Duplicate the argument checking from CallAnalyzer::simplifyCallSite since
1556 /// we expect calls of this specific intrinsic to be infrequent.
1557 ///
1558 /// FIXME: Given that we know CB's parent (F) caller
1559 /// (CandidateCall->getParent()->getParent()), we might be able to determine
1560 /// whether inlining F into F's caller would change how the call to
1561 /// llvm.is.constant would evaluate.
1562 bool CallAnalyzer::simplifyIntrinsicCallIsConstant(CallBase &CB) {
1563 Value *Arg = CB.getArgOperand(0);
1564 auto *C = dyn_cast<Constant>(Arg);
1565
1566 if (!C)
1567 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(Arg));
1568
1569 Type *RT = CB.getFunctionType()->getReturnType();
1570 SimplifiedValues[&CB] = ConstantInt::get(RT, C ? 1 : 0);
1571 return true;
1572 }
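// For illustration, assuming a call such as:
//   %r = call i1 @llvm.is.constant.i32(i32 %v)
// This folds to i1 1 when %v is a constant (or has been simplified to one at
// this call site), and to i1 0 otherwise.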
1573
1574 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
1575 // Propagate constants through bitcasts.
1576 if (simplifyInstruction(I))
1577 return true;
1578
1579 // Track base/offsets through casts
1580 std::pair<Value *, APInt> BaseAndOffset =
1581 ConstantOffsetPtrs.lookup(I.getOperand(0));
1582 // Casts don't change the offset, just wrap it up.
1583 if (BaseAndOffset.first)
1584 ConstantOffsetPtrs[&I] = BaseAndOffset;
1585
1586 // Also look for SROA candidates here.
1587 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1588 SROAArgValues[&I] = SROAArg;
1589
1590 // Bitcasts are always zero cost.
1591 return true;
1592 }
1593
1594 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
1595 // Propagate constants through ptrtoint.
1596 if (simplifyInstruction(I))
1597 return true;
1598
1599 // Track base/offset pairs when converted to a plain integer provided the
1600 // integer is large enough to represent the pointer.
1601 unsigned IntegerSize = I.getType()->getScalarSizeInBits();
1602 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
1603 if (IntegerSize == DL.getPointerSizeInBits(AS)) {
1604 std::pair<Value *, APInt> BaseAndOffset =
1605 ConstantOffsetPtrs.lookup(I.getOperand(0));
1606 if (BaseAndOffset.first)
1607 ConstantOffsetPtrs[&I] = BaseAndOffset;
1608 }
1609
1610 // This is really weird. Technically, ptrtoint will disable SROA. However,
1611 // unless that ptrtoint is *used* somewhere in the live basic blocks after
1612 // inlining, it will be nuked, and SROA should proceed. All of the uses which
1613 // would block SROA would also block SROA if applied directly to a pointer,
1614 // and so we can just add the integer in here. The only places where SROA is
1615 // preserved either cannot fire on an integer, or won't in-and-of themselves
1616 // disable SROA (ext) w/o some later use that we would see and disable.
1617 if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
1618 SROAArgValues[&I] = SROAArg;
1619
1620 return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1621 TargetTransformInfo::TCC_Free;
1622 }
1623
1624 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
1625 // Propagate constants through inttoptr.
1626 if (simplifyInstruction(I))
1627 return true;
1628
1629 // Track base/offset pairs when round-tripped through a pointer without
1630 // modifications provided the integer is not too large.
1631 Value *Op = I.getOperand(0);
1632 unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
1633 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
1634 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
1635 if (BaseAndOffset.first)
1636 ConstantOffsetPtrs[&I] = BaseAndOffset;
1637 }
1638
1639 // "Propagate" SROA here in the same manner as we do for ptrtoint above.
1640 if (auto *SROAArg = getSROAArgForValueOrNull(Op))
1641 SROAArgValues[&I] = SROAArg;
1642
1643 return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1644 TargetTransformInfo::TCC_Free;
1645 }
1646
1647 bool CallAnalyzer::visitCastInst(CastInst &I) {
1648 // Propagate constants through casts.
1649 if (simplifyInstruction(I))
1650 return true;
1651
1652 // Disable SROA in the face of arbitrary casts we don't explicitly list
1653 // elsewhere.
1654 disableSROA(I.getOperand(0));
1655
1656 // If this is a floating-point cast, and the target says this operation
1657 // is expensive, this may eventually become a library call. Treat the cost
1658 // as such.
1659 switch (I.getOpcode()) {
1660 case Instruction::FPTrunc:
1661 case Instruction::FPExt:
1662 case Instruction::UIToFP:
1663 case Instruction::SIToFP:
1664 case Instruction::FPToUI:
1665 case Instruction::FPToSI:
1666 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
1667 onCallPenalty();
1668 break;
1669 default:
1670 break;
1671 }
1672
1673 return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
1674 TargetTransformInfo::TCC_Free;
1675 }
1676
1677 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
1678 return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
1679 }
1680
1681 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
1682 // Does the *call site* have the NonNull attribute set on an argument? We
1683 // use the attribute on the call site to memoize any analysis done in the
1684 // caller. This will also trip if the callee function has a non-null
1685 // parameter attribute, but that's a less interesting case because hopefully
1686 // the callee would already have been simplified based on that.
1687 if (Argument *A = dyn_cast<Argument>(V))
1688 if (paramHasAttr(A, Attribute::NonNull))
1689 return true;
1690
1691 // Is this an alloca in the caller? This is distinct from the attribute case
1692 // above because attributes aren't updated within the inliner itself and we
1693 // always want to catch the alloca derived case.
1694 if (isAllocaDerivedArg(V))
1695 // We can actually predict the result of comparisons between an
1696 // alloca-derived value and null. Note that this fires regardless of
1697 // SROA firing.
1698 return true;
1699
1700 return false;
1701 }
1702
1703 bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
1704 // If the normal destination of the invoke or the parent block of the call
1705 // site is unreachable-terminated, there is little point in inlining this
1706 // unless there is literally zero cost.
1707 // FIXME: Note that it is possible that an unreachable-terminated block has a
1708 // hot entry. For example, in the scenario below, inlining hot_call_X() may
1709 // be beneficial:
1710 // main() {
1711 // hot_call_1();
1712 // ...
1713 // hot_call_N()
1714 // exit(0);
1715 // }
1716 // For now, we are not handling this corner case here as it is rare in real
1717 // code. In future, we should elaborate this based on BPI and BFI in more
1718 // general threshold adjusting heuristics in updateThreshold().
1719 if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
1720 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
1721 return false;
1722 } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
1723 return false;
1724
1725 return true;
1726 }
1727
1728 bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
1729 BlockFrequencyInfo *CallerBFI) {
1730 // If global profile summary is available, then callsite's coldness is
1731 // determined based on that.
1732 if (PSI && PSI->hasProfileSummary())
1733 return PSI->isColdCallSite(Call, CallerBFI);
1734
1735 // Otherwise we need BFI to be available.
1736 if (!CallerBFI)
1737 return false;
1738
1739 // Determine if the callsite is cold relative to caller's entry. We could
1740 // potentially cache the computation of scaled entry frequency, but the added
1741 // complexity is not worth it unless this scaling shows up high in the
1742 // profiles.
1743 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
1744 auto CallSiteBB = Call.getParent();
1745 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
1746 auto CallerEntryFreq =
1747 CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
1748 return CallSiteFreq < CallerEntryFreq * ColdProb;
1749 }
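// For illustration, with assumed numbers: if ColdCallSiteRelFreq were 2, a
// callsite block frequency of 10 against a caller entry frequency of 1000 is
// cold, because 10 < 1000 * 2 / 100 = 20.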
1750
1751 Optional<int>
1752 InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
1753 BlockFrequencyInfo *CallerBFI) {
1754
1755 // If global profile summary is available, then callsite's hotness is
1756 // determined based on that.
1757 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
1758 return Params.HotCallSiteThreshold;
1759
1760 // Otherwise we need BFI to be available and to have a locally hot callsite
1761 // threshold.
1762 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
1763 return None;
1764
1765 // Determine if the callsite is hot relative to caller's entry. We could
1766 // potentially cache the computation of scaled entry frequency, but the added
1767 // complexity is not worth it unless this scaling shows up high in the
1768 // profiles.
1769 auto CallSiteBB = Call.getParent();
1770 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
1771 auto CallerEntryFreq = CallerBFI->getEntryFreq();
1772 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
1773 return Params.LocallyHotCallSiteThreshold;
1774
1775 // Otherwise treat it normally.
1776 return None;
1777 }
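// For illustration, with assumed numbers: if HotCallSiteRelFreq were 60, a
// callsite block frequency of 120000 against a caller entry frequency of
// 1000 satisfies 120000 >= 1000 * 60, so the locally hot threshold is used.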
1778
1779 void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
1780 // If no size growth is allowed for this inlining, set Threshold to 0.
1781 if (!allowSizeGrowth(Call)) {
1782 Threshold = 0;
1783 return;
1784 }
1785
1786 Function *Caller = Call.getCaller();
1787
1788 // return min(A, B) if B is valid.
1789 auto MinIfValid = [](int A, Optional<int> B) {
1790 return B ? std::min(A, B.value()) : A;
1791 };
1792
1793 // return max(A, B) if B is valid.
1794 auto MaxIfValid = [](int A, Optional<int> B) {
1795 return B ? std::max(A, B.value()) : A;
1796 };
1797
1798 // Various bonus percentages. These are multiplied by Threshold to get the
1799 // bonus values.
1800 // SingleBBBonus: This bonus is applied if the callee has a single reachable
1801 // basic block at the given callsite context. This is speculatively applied
1802 // and withdrawn if more than one basic block is seen.
1803 //
1804 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
1805 // of the last call to a static function as inlining such functions is
1806 // guaranteed to reduce code size.
1807 //
1808 // These bonus percentages may be set to 0 based on properties of the caller
1809 // and the callsite.
1810 int SingleBBBonusPercent = 50;
1811 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
1812 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
1813
1814 // Lambda to set all the above bonus and bonus percentages to 0.
1815 auto DisallowAllBonuses = [&]() {
1816 SingleBBBonusPercent = 0;
1817 VectorBonusPercent = 0;
1818 LastCallToStaticBonus = 0;
1819 };
1820
1821 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
1822 // and reduce the threshold if the caller has the necessary attribute.
1823 if (Caller->hasMinSize()) {
1824 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
1825 // For minsize, we want to disable the single BB bonus and the vector
1826 // bonuses, but not the last-call-to-static bonus. Inlining the last call to
1827 // a static function will, at the minimum, eliminate the parameter setup and
1828 // call/return instructions.
1829 SingleBBBonusPercent = 0;
1830 VectorBonusPercent = 0;
1831 } else if (Caller->hasOptSize())
1832 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
1833
1834 // Adjust the threshold based on inlinehint attribute and profile based
1835 // hotness information if the caller does not have MinSize attribute.
1836 if (!Caller->hasMinSize()) {
1837 if (Callee.hasFnAttribute(Attribute::InlineHint))
1838 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1839
1840 // FIXME: After switching to the new passmanager, simplify the logic below
1841 // by checking only the callsite hotness/coldness as we will reliably
1842 // have local profile information.
1843 //
1844 // Callsite hotness and coldness can be determined if sample profile is
1845 // used (which adds hotness metadata to calls) or if caller's
1846 // BlockFrequencyInfo is available.
1847 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
1848 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
1849 if (!Caller->hasOptSize() && HotCallSiteThreshold) {
1850 LLVM_DEBUG(dbgs() << "Hot callsite.\n");
1851 // FIXME: This should update the threshold only if it exceeds the
1852 // current threshold, but AutoFDO + ThinLTO currently relies on this
1853 // behavior to prevent inlining of hot callsites during ThinLTO
1854 // compile phase.
1855 Threshold = *HotCallSiteThreshold;
1856 } else if (isColdCallSite(Call, CallerBFI)) {
1857 LLVM_DEBUG(dbgs() << "Cold callsite.\n");
1858 // Do not apply bonuses for a cold callsite including the
1859 // LastCallToStatic bonus. While this bonus might result in code size
1860 // reduction, it can cause the size of a non-cold caller to increase
1861 // preventing it from being inlined.
1862 DisallowAllBonuses();
1863 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
1864 } else if (PSI) {
1865 // Use callee's global profile information only if we have no way of
1866 // determining this via callsite information.
1867 if (PSI->isFunctionEntryHot(&Callee)) {
1868 LLVM_DEBUG(dbgs() << "Hot callee.\n");
1869 // If callsite hotness cannot be determined, we may still know
1870 // that the callee is hot and treat it as a weaker hint for threshold
1871 // increase.
1872 Threshold = MaxIfValid(Threshold, Params.HintThreshold);
1873 } else if (PSI->isFunctionEntryCold(&Callee)) {
1874 LLVM_DEBUG(dbgs() << "Cold callee.\n");
1875 // Do not apply bonuses for a cold callee including the
1876 // LastCallToStatic bonus. While this bonus might result in code size
1877 // reduction, it can cause the size of a non-cold caller to increase
1878 // preventing it from being inlined.
1879 DisallowAllBonuses();
1880 Threshold = MinIfValid(Threshold, Params.ColdThreshold);
1881 }
1882 }
1883 }
1884
1885 Threshold += TTI.adjustInliningThreshold(&Call);
1886
1887 // Finally, take the target-specific inlining threshold multiplier into
1888 // account.
1889 Threshold *= TTI.getInliningThresholdMultiplier();
1890
1891 SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
1892 VectorBonus = Threshold * VectorBonusPercent / 100;
1893
1894 bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneLiveUse() &&
1895 &F == Call.getCalledFunction();
1896 // If there is only one call of the function, and it has internal linkage,
1897 // the cost of inlining it drops dramatically. It may seem odd to update
1898 // Cost in updateThreshold, but the bonus depends on the logic in this method.
1899 if (OnlyOneCallAndLocalLinkage)
1900 Cost -= LastCallToStaticBonus;
1901 }
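// Worked example of the bonus arithmetic above, with assumed inputs: for
// Threshold = 225, SingleBBBonusPercent = 50 and VectorBonusPercent = 150,
// the speculative bonuses are SingleBBBonus = 225 * 50 / 100 = 112 and
// VectorBonus = 225 * 150 / 100 = 337 (integer division). The
// last-call-to-static bonus is not scaled; it is subtracted from Cost
// directly when the callee has local linkage and a single live use here.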
1902
1903 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
1904 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1905 // First try to handle simplified comparisons.
1906 if (simplifyInstruction(I))
1907 return true;
1908
1909 if (I.getOpcode() == Instruction::FCmp)
1910 return false;
1911
1912 // Otherwise look for a comparison between constant offset pointers with
1913 // a common base.
1914 Value *LHSBase, *RHSBase;
1915 APInt LHSOffset, RHSOffset;
1916 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1917 if (LHSBase) {
1918 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1919 if (RHSBase && LHSBase == RHSBase) {
1920 // We have common bases, fold the icmp to a constant based on the
1921 // offsets.
1922 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1923 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1924 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
1925 SimplifiedValues[&I] = C;
1926 ++NumConstantPtrCmps;
1927 return true;
1928 }
1929 }
1930 }
1931
1932 // If the comparison is an equality comparison with null, we can simplify it
1933 // if we know the value (argument) can't be null.
1934 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
1935 isKnownNonNullInCallee(I.getOperand(0))) {
1936 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
1937 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
1938 : ConstantInt::getFalse(I.getType());
1939 return true;
1940 }
1941 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
1942 }
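// For illustration, assuming %p and %q are both in ConstantOffsetPtrs with
// the same base and offsets 8 and 16 respectively:
//   %c = icmp ult i8* %p, %q
// folds here to i1 1 by comparing the two offsets directly.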
1943
1944 bool CallAnalyzer::visitSub(BinaryOperator &I) {
1945 // Try to handle a special case: we can fold computing the difference of two
1946 // constant-related pointers.
1947 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1948 Value *LHSBase, *RHSBase;
1949 APInt LHSOffset, RHSOffset;
1950 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
1951 if (LHSBase) {
1952 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
1953 if (RHSBase && LHSBase == RHSBase) {
1954 // We have common bases, fold the subtract to a constant based on the
1955 // offsets.
1956 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
1957 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
1958 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
1959 SimplifiedValues[&I] = C;
1960 ++NumConstantPtrDiffs;
1961 return true;
1962 }
1963 }
1964 }
1965
1966 // Otherwise, fall back to the generic logic for simplifying and handling
1967 // instructions.
1968 return Base::visitSub(I);
1969 }
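// For illustration, assuming %pi and %qi are ptrtoint results whose
// ConstantOffsetPtrs entries share one base at offsets 24 and 8:
//   %d = sub i64 %pi, %qi
// folds here to i64 16 (24 - 8) without needing to know the base itself.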
1970
1971 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
1972 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
1973 Constant *CLHS = dyn_cast<Constant>(LHS);
1974 if (!CLHS)
1975 CLHS = SimplifiedValues.lookup(LHS);
1976 Constant *CRHS = dyn_cast<Constant>(RHS);
1977 if (!CRHS)
1978 CRHS = SimplifiedValues.lookup(RHS);
1979
1980 Value *SimpleV = nullptr;
1981 if (auto FI = dyn_cast<FPMathOperator>(&I))
1982 SimpleV = simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
1983 FI->getFastMathFlags(), DL);
1984 else
1985 SimpleV =
1986 simplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);
1987
1988 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
1989 SimplifiedValues[&I] = C;
1990
1991 if (SimpleV)
1992 return true;
1993
1994 // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
1995 disableSROA(LHS);
1996 disableSROA(RHS);
1997
1998 // If the instruction is floating point, and the target says this operation
1999 // is expensive, this may eventually become a library call. Treat the cost
2000 // as such. Unless it's fneg which can be implemented with an xor.
2001 using namespace llvm::PatternMatch;
2002 if (I.getType()->isFloatingPointTy() &&
2003 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
2004 !match(&I, m_FNeg(m_Value())))
2005 onCallPenalty();
2006
2007 return false;
2008 }
2009
2010 bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
2011 Value *Op = I.getOperand(0);
2012 Constant *COp = dyn_cast<Constant>(Op);
2013 if (!COp)
2014 COp = SimplifiedValues.lookup(Op);
2015
2016 Value *SimpleV = simplifyFNegInst(
2017 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);
2018
2019 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
2020 SimplifiedValues[&I] = C;
2021
2022 if (SimpleV)
2023 return true;
2024
2025 // Disable any SROA on arguments to arbitrary, unsimplified fneg.
2026 disableSROA(Op);
2027
2028 return false;
2029 }
2030
2031 bool CallAnalyzer::visitLoad(LoadInst &I) {
2032 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2033 return true;
2034
2035 // If the data is already loaded from this address and hasn't been clobbered
2036 // by any stores or calls, this load is likely to be redundant and can be
2037 // eliminated.
2038 if (EnableLoadElimination &&
2039 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
2040 onLoadEliminationOpportunity();
2041 return true;
2042 }
2043
2044 return false;
2045 }
2046
2047 bool CallAnalyzer::visitStore(StoreInst &I) {
2048 if (handleSROA(I.getPointerOperand(), I.isSimple()))
2049 return true;
2050
2051 // The store can potentially clobber loads and prevent repeated loads from
2052 // being eliminated.
2053 // FIXME:
2054 // 1. We can probably keep an initial set of eliminatable loads subtracted
2055 // from the cost even when we finally see a store. We just need to disable
2056 // *further* accumulation of elimination savings.
2057 // 2. We should probably at some point thread MemorySSA for the callee into
2058 // this and then use that to actually compute *really* precise savings.
2059 disableLoadElimination();
2060 return false;
2061 }
2062
2063 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
2064 // Constant folding for extract value is trivial.
2065 if (simplifyInstruction(I))
2066 return true;
2067
2068 // SROA can't look through these, but they may be free.
2069 return Base::visitExtractValue(I);
2070 }
2071
2072 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
2073 // Constant folding for insert value is trivial.
2074 if (simplifyInstruction(I))
2075 return true;
2076
2077 // SROA can't look through these, but they may be free.
2078 return Base::visitInsertValue(I);
2079 }
2080
2081 /// Try to simplify a call site.
2082 ///
2083 /// Takes a concrete function and callsite and tries to actually simplify it by
2084 /// analyzing the arguments and call itself with instsimplify. Returns true if
2085 /// it has simplified the callsite to some other entity (a constant), making it
2086 /// free.
2087 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
2088 // FIXME: Using the instsimplify logic directly for this is inefficient
2089 // because we have to continually rebuild the argument list even when no
2090 // simplifications can be performed. Until that is fixed with remapping
2091 // inside of instsimplify, directly constant fold calls here.
2092 if (!canConstantFoldCallTo(&Call, F))
2093 return false;
2094
2095 // Try to re-map the arguments to constants.
2096 SmallVector<Constant *, 4> ConstantArgs;
2097 ConstantArgs.reserve(Call.arg_size());
2098 for (Value *I : Call.args()) {
2099 Constant *C = dyn_cast<Constant>(I);
2100 if (!C)
2101 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
2102 if (!C)
2103 return false; // This argument doesn't map to a constant.
2104
2105 ConstantArgs.push_back(C);
2106 }
2107 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
2108 SimplifiedValues[&Call] = C;
2109 return true;
2110 }
2111
2112 return false;
2113 }
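// For illustration, assuming a call with a foldable callee and constant
// arguments, e.g.:
//   %r = call double @llvm.fabs.f64(double -3.0)
// ConstantFoldCall records double 3.0 in SimplifiedValues and the call site
// becomes free for the purpose of this analysis.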
2114
2115 bool CallAnalyzer::visitCallBase(CallBase &Call) {
2116 if (!onCallBaseVisitStart(Call))
2117 return true;
2118
2119 if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
2120 !F.hasFnAttribute(Attribute::ReturnsTwice)) {
2121 // This aborts the entire analysis.
2122 ExposesReturnsTwice = true;
2123 return false;
2124 }
2125 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
2126 ContainsNoDuplicateCall = true;
2127
2128 Function *F = Call.getCalledFunction();
2129 bool IsIndirectCall = !F;
2130 if (IsIndirectCall) {
2131 // Check if this happens to be an indirect function call to a known function
2132 // in this inline context. If not, we've done all we can.
2133 Value *Callee = Call.getCalledOperand();
2134 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
2135 if (!F || F->getFunctionType() != Call.getFunctionType()) {
2136 onCallArgumentSetup(Call);
2137
2138 if (!Call.onlyReadsMemory())
2139 disableLoadElimination();
2140 return Base::visitCallBase(Call);
2141 }
2142 }
2143
2144 assert(F && "Expected a call to a known function");
2145
2146 // When we have a concrete function, first try to simplify it directly.
2147 if (simplifyCallSite(F, Call))
2148 return true;
2149
2150 // Next check if it is an intrinsic we know about.
2151 // FIXME: Lift this into part of the InstVisitor.
2152 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
2153 switch (II->getIntrinsicID()) {
2154 default:
2155 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
2156 disableLoadElimination();
2157 return Base::visitCallBase(Call);
2158
2159 case Intrinsic::load_relative:
2160 onLoadRelativeIntrinsic();
2161 return false;
2162
2163 case Intrinsic::memset:
2164 case Intrinsic::memcpy:
2165 case Intrinsic::memmove:
2166 disableLoadElimination();
2167 // SROA can usually chew through these intrinsics, but they aren't free.
2168 return false;
2169 case Intrinsic::icall_branch_funnel:
2170 case Intrinsic::localescape:
2171 HasUninlineableIntrinsic = true;
2172 return false;
2173 case Intrinsic::vastart:
2174 InitsVargArgs = true;
2175 return false;
2176 case Intrinsic::launder_invariant_group:
2177 case Intrinsic::strip_invariant_group:
2178 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0)))
2179 SROAArgValues[II] = SROAArg;
2180 return true;
2181 case Intrinsic::is_constant:
2182 return simplifyIntrinsicCallIsConstant(Call);
2183 }
2184 }
2185
2186 if (F == Call.getFunction()) {
2187 // This flag will fully abort the analysis, so don't bother with anything
2188 // else.
2189 IsRecursiveCall = true;
2190 if (!AllowRecursiveCall)
2191 return false;
2192 }
2193
2194 if (TTI.isLoweredToCall(F)) {
2195 onLoweredCall(F, Call, IsIndirectCall);
2196 }
2197
2198 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory())))
2199 disableLoadElimination();
2200 return Base::visitCallBase(Call);
2201 }
2202
2203 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
2204 // At least one return instruction will be free after inlining.
2205 bool Free = !HasReturn;
2206 HasReturn = true;
2207 return Free;
2208 }
2209
2210 bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
2211 // We model unconditional branches as essentially free -- they really
2212 // shouldn't exist at all, but handling them makes the behavior of the
2213 // inliner more regular and predictable. Interestingly, conditional branches
2214 // which will fold away are also free.
2215 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
2216 isa_and_nonnull<ConstantInt>(
2217 SimplifiedValues.lookup(BI.getCondition()));
2218 }
2219
2220 bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
2221 bool CheckSROA = SI.getType()->isPointerTy();
2222 Value *TrueVal = SI.getTrueValue();
2223 Value *FalseVal = SI.getFalseValue();
2224
2225 Constant *TrueC = dyn_cast<Constant>(TrueVal);
2226 if (!TrueC)
2227 TrueC = SimplifiedValues.lookup(TrueVal);
2228 Constant *FalseC = dyn_cast<Constant>(FalseVal);
2229 if (!FalseC)
2230 FalseC = SimplifiedValues.lookup(FalseVal);
2231 Constant *CondC =
2232 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
2233
2234 if (!CondC) {
2235 // Select C, X, X => X
2236 if (TrueC == FalseC && TrueC) {
2237 SimplifiedValues[&SI] = TrueC;
2238 return true;
2239 }
2240
2241 if (!CheckSROA)
2242 return Base::visitSelectInst(SI);
2243
2244 std::pair<Value *, APInt> TrueBaseAndOffset =
2245 ConstantOffsetPtrs.lookup(TrueVal);
2246 std::pair<Value *, APInt> FalseBaseAndOffset =
2247 ConstantOffsetPtrs.lookup(FalseVal);
2248 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
2249 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
2250
2251 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal))
2252 SROAArgValues[&SI] = SROAArg;
2253 return true;
2254 }
2255
2256 return Base::visitSelectInst(SI);
2257 }
2258
2259 // Select condition is a constant.
2260 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal
2261 : (CondC->isNullValue()) ? FalseVal
2262 : nullptr;
2263 if (!SelectedV) {
2264 // Condition is a vector constant that is not all 1s or all 0s. If all
2265 // operands are constants, ConstantExpr::getSelect() can handle cases such
2266 // as vector selects.
2267 if (TrueC && FalseC) {
2268 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
2269 SimplifiedValues[&SI] = C;
2270 return true;
2271 }
2272 }
2273 return Base::visitSelectInst(SI);
2274 }
2275
2276 // Condition is either all 1s or all 0s. SI can be simplified.
2277 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
2278 SimplifiedValues[&SI] = SelectedC;
2279 return true;
2280 }
2281
2282 if (!CheckSROA)
2283 return true;
2284
2285 std::pair<Value *, APInt> BaseAndOffset =
2286 ConstantOffsetPtrs.lookup(SelectedV);
2287 if (BaseAndOffset.first) {
2288 ConstantOffsetPtrs[&SI] = BaseAndOffset;
2289
2290 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV))
2291 SROAArgValues[&SI] = SROAArg;
2292 }
2293
2294 return true;
2295 }
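// For illustration, assuming the condition of
//   %v = select i1 %c, i32 10, i32 20
// has been simplified to i1 1: SelectedV is the true operand and the select
// is recorded as the constant i32 10; a condition of i1 0 picks i32 20.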
2296
2297 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
2298 // We model unconditional switches as free, see the comments on handling
2299 // branches.
2300 if (isa<ConstantInt>(SI.getCondition()))
2301 return true;
2302 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
2303 if (isa<ConstantInt>(V))
2304 return true;
2305
2306 // Assume the most general case where the switch is lowered into
2307 // either a jump table, bit test, or a balanced binary tree consisting of
2308 // case clusters without merging adjacent clusters with the same
2309 // destination. We do not consider the switches that are lowered with a mix
2310 // of jump table/bit test/binary search tree. The cost of the switch is
2311 // proportional to the size of the tree or the size of jump table range.
2312 //
2313 // NB: We convert large switches which are just used to initialize large phi
2314 // nodes to lookup tables instead in simplifycfg, so this shouldn't prevent
2315 // inlining those. It will prevent inlining in cases where the optimization
2316 // does not (yet) fire.
2317
2318 unsigned JumpTableSize = 0;
2319 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
2320 unsigned NumCaseCluster =
2321 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
2322
2323 onFinalizeSwitch(JumpTableSize, NumCaseCluster);
2324 return false;
2325 }
2326
2327 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
2328 // We never want to inline functions that contain an indirectbr. This is
2329 // incorrect because all the blockaddresses (in static global initializers,
2330 // for example) would be referring to the original function, and this
2331 // indirect jump would jump from the inlined copy of the function into the
2332 // original function, which is undefined behavior.
2333 // FIXME: This logic isn't really right; we can safely inline functions with
2334 // indirectbr's as long as no other function or global references the
2335 // blockaddress of a block within the current function.
2336 HasIndirectBr = true;
2337 return false;
2338 }
2339
2340 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
2341 // FIXME: It's not clear that a single instruction is an accurate model for
2342 // the inline cost of a resume instruction.
2343 return false;
2344 }
2345
2346 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
2347 // FIXME: It's not clear that a single instruction is an accurate model for
2348 // the inline cost of a cleanupret instruction.
2349 return false;
2350 }
2351
2352 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
2353 // FIXME: It's not clear that a single instruction is an accurate model for
2354 // the inline cost of a catchret instruction.
2355 return false;
2356 }
2357
2358 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
2359 // FIXME: It might be reasonable to discount the cost of instructions leading
2360 // to unreachable as they have the lowest possible impact on both runtime and
2361 // code size.
2362 return true; // No actual code is needed for unreachable.
2363 }
2364
2365 bool CallAnalyzer::visitInstruction(Instruction &I) {
2366 // Some instructions are free. All of the free intrinsics can also be
2367 // handled by SROA, etc.
2368 if (TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
2369 TargetTransformInfo::TCC_Free)
2370 return true;
2371
2372 // We found something we don't understand or can't handle. Mark any SROA-able
2373 // values in the operand list as no longer viable.
2374 for (const Use &Op : I.operands())
2375 disableSROA(Op);
2376
2377 return false;
2378 }
2379
2380 /// Analyze a basic block for its contribution to the inline cost.
2381 ///
2382 /// This method walks the analyzer over every instruction in the given basic
2383 /// block and accounts for their cost during inlining at this callsite. It
2384 /// aborts early if the threshold has been exceeded or an impossible to inline
2385 /// construct has been detected. It returns false if inlining is no longer
2386 /// viable, and true if inlining remains viable.
2387 InlineResult
2388 CallAnalyzer::analyzeBlock(BasicBlock *BB,
2389 SmallPtrSetImpl<const Value *> &EphValues) {
2390 for (Instruction &I : *BB) {
2391 // FIXME: Currently, the number of instructions in a function, regardless of
2392 // our ability to simplify them during inlining to constants or dead code,
2393 // is actually used by the vector bonus heuristic. As long as that's true,
2394 // we have to special case debug intrinsics here to prevent differences in
2395 // inlining due to debug symbols. Eventually, the number of unsimplified
2396 // instructions shouldn't factor into the cost computation, but until then,
2397 // hack around it here.
2398 // Similarly, skip pseudo-probes.
2399 if (I.isDebugOrPseudoInst())
2400 continue;
2401
2402 // Skip ephemeral values.
2403 if (EphValues.count(&I))
2404 continue;
2405
2406 ++NumInstructions;
2407 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
2408 ++NumVectorInstructions;
2409
2410 // If the instruction simplified to a constant, there is no cost to this
2411 // instruction. Visit the instructions using our InstVisitor to account for
2412 // all of the per-instruction logic. The visit tree returns true if we
2413 // consumed the instruction in any way, and false if the instruction's base
2414 // cost should count against inlining.
2415 onInstructionAnalysisStart(&I);
2416
2417 if (Base::visit(&I))
2418 ++NumInstructionsSimplified;
2419 else
2420 onMissedSimplification();
2421
2422 onInstructionAnalysisFinish(&I);
2423 using namespace ore;
2424 // If visiting this instruction detected an uninlinable pattern, abort.
2425 InlineResult IR = InlineResult::success();
2426 if (IsRecursiveCall && !AllowRecursiveCall)
2427 IR = InlineResult::failure("recursive");
2428 else if (ExposesReturnsTwice)
2429 IR = InlineResult::failure("exposes returns twice");
2430 else if (HasDynamicAlloca)
2431 IR = InlineResult::failure("dynamic alloca");
2432 else if (HasIndirectBr)
2433 IR = InlineResult::failure("indirect branch");
2434 else if (HasUninlineableIntrinsic)
2435 IR = InlineResult::failure("uninlinable intrinsic");
2436 else if (InitsVargArgs)
2437 IR = InlineResult::failure("varargs");
2438 if (!IR.isSuccess()) {
2439 if (ORE)
2440 ORE->emit([&]() {
2441 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2442 &CandidateCall)
2443 << NV("Callee", &F) << " has uninlinable pattern ("
2444 << NV("InlineResult", IR.getFailureReason())
2445 << ") and cost is not fully computed";
2446 });
2447 return IR;
2448 }
2449
2450 // If the caller is a recursive function then we don't want to inline
2451 // functions which allocate a lot of stack space because it would increase
2452 // the caller stack usage dramatically.
2453 if (IsCallerRecursive && AllocatedSize > RecurStackSizeThreshold) {
2454 auto IR =
2455 InlineResult::failure("recursive and allocates too much stack space");
2456 if (ORE)
2457 ORE->emit([&]() {
2458 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
2459 &CandidateCall)
2460 << NV("Callee", &F) << " is "
2461 << NV("InlineResult", IR.getFailureReason())
2462 << ". Cost is not fully computed";
2463 });
2464 return IR;
2465 }
2466
2467 if (shouldStop())
2468 return InlineResult::failure(
2469 "Call site analysis is not favorable to inlining.");
2470 }
2471
2472 return InlineResult::success();
2473 }
2474
2475 /// Compute the base pointer and cumulative constant offsets for V.
2476 ///
2477 /// This strips all constant offsets off of V, leaving it the base pointer, and
2478 /// accumulates the total constant offset applied in the returned constant. It
2479 /// returns null if V is not a pointer, and returns the constant '0' if there are
2480 /// no constant offsets applied.
2481 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
2482 if (!V->getType()->isPointerTy())
2483 return nullptr;
2484
2485 unsigned AS = V->getType()->getPointerAddressSpace();
2486 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
2487 APInt Offset = APInt::getZero(IntPtrWidth);
2488
2489 // Even though we don't look through PHI nodes, we could be called on an
2490 // instruction in an unreachable block, which may be on a cycle.
2491 SmallPtrSet<Value *, 4> Visited;
2492 Visited.insert(V);
2493 do {
2494 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
2495 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
2496 return nullptr;
2497 V = GEP->getPointerOperand();
2498 } else if (Operator::getOpcode(V) == Instruction::BitCast) {
2499 V = cast<Operator>(V)->getOperand(0);
2500 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
2501 if (GA->isInterposable())
2502 break;
2503 V = GA->getAliasee();
2504 } else {
2505 break;
2506 }
2507 assert(V->getType()->isPointerTy() && "Unexpected operand type!");
2508 } while (Visited.insert(V).second);
2509
2510 Type *IdxPtrTy = DL.getIndexType(V->getType());
2511 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset));
2512 }
2513
2514 /// Find dead blocks due to deleted CFG edges during inlining.
2515 ///
2516 /// If we know the successor of the current block, \p CurrBB, has to be \p
2517 /// NextBB, the other successors of \p CurrBB are dead if these successors have
2518 /// no live incoming CFG edges. If one block is found to be dead, we can
2519 /// continue growing the dead block list by checking the successors of the dead
2520 /// blocks to see if all their incoming edges are dead or not.
2521 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
2522 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
2523 // A CFG edge is dead if the predecessor is dead or the predecessor has a
2524 // known successor which is not the one under exam.
2525 return (DeadBlocks.count(Pred) ||
2526 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
2527 };
2528
2529 auto IsNewlyDead = [&](BasicBlock *BB) {
2530 // If all the edges to a block are dead, the block is also dead.
2531 return (!DeadBlocks.count(BB) &&
2532 llvm::all_of(predecessors(BB),
2533 [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
2534 };
2535
2536 for (BasicBlock *Succ : successors(CurrBB)) {
2537 if (Succ == NextBB || !IsNewlyDead(Succ))
2538 continue;
2539 SmallVector<BasicBlock *, 4> NewDead;
2540 NewDead.push_back(Succ);
2541 while (!NewDead.empty()) {
2542 BasicBlock *Dead = NewDead.pop_back_val();
2543 if (DeadBlocks.insert(Dead).second)
2544 // Continue growing the dead block lists.
2545 for (BasicBlock *S : successors(Dead))
2546 if (IsNewlyDead(S))
2547 NewDead.push_back(S);
2548 }
2549 }
2550 }
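// For illustration, assuming CurrBB ends in a conditional branch to %then and
// %else and the condition simplified so that NextBB is %then: if %else has no
// other live incoming edges it is added to DeadBlocks, and any block
// reachable only through %else is marked dead by the worklist above as well.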
2551
2552 /// Analyze a call site for potential inlining.
2553 ///
2554 /// Returns true if inlining this call is viable, and false if it is not
2555 /// viable. It computes the cost and adjusts the threshold based on numerous
2556 /// factors and heuristics. If this method returns false but the computed cost
2557 /// is below the computed threshold, then inlining was forcibly disabled by
2558 /// some artifact of the routine.
2559 InlineResult CallAnalyzer::analyze() {
2560 ++NumCallsAnalyzed;
2561
2562 auto Result = onAnalysisStart();
2563 if (!Result.isSuccess())
2564 return Result;
2565
2566 if (F.empty())
2567 return InlineResult::success();
2568
2569 Function *Caller = CandidateCall.getFunction();
2570 // Check if the caller function is recursive itself.
2571 for (User *U : Caller->users()) {
2572 CallBase *Call = dyn_cast<CallBase>(U);
2573 if (Call && Call->getFunction() == Caller) {
2574 IsCallerRecursive = true;
2575 break;
2576 }
2577 }
2578
2579 // Populate our simplified values by mapping from function arguments to call
2580 // arguments with known important simplifications.
2581 auto CAI = CandidateCall.arg_begin();
2582 for (Argument &FAI : F.args()) {
2583 assert(CAI != CandidateCall.arg_end());
2584 if (Constant *C = dyn_cast<Constant>(CAI))
2585 SimplifiedValues[&FAI] = C;
2586
2587 Value *PtrArg = *CAI;
2588 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
2589 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue());
2590
2591 // We can SROA any pointer arguments derived from alloca instructions.
2592 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
2593 SROAArgValues[&FAI] = SROAArg;
2594 onInitializeSROAArg(SROAArg);
2595 EnabledSROAAllocas.insert(SROAArg);
2596 }
2597 }
2598 ++CAI;
2599 }
2600 NumConstantArgs = SimplifiedValues.size();
2601 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
2602 NumAllocaArgs = SROAArgValues.size();
2603
2604 // FIXME: If a caller has multiple calls to a callee, we end up recomputing
2605 // the ephemeral values multiple times (and they're completely determined by
2606 // the callee, so this is purely duplicate work).
2607 SmallPtrSet<const Value *, 32> EphValues;
2608 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
2609
2610 // The worklist of live basic blocks in the callee *after* inlining. We avoid
2611 // adding basic blocks of the callee which can be proven to be dead for this
2612 // particular call site in order to get more accurate cost estimates. This
2613 // requires a somewhat heavyweight iteration pattern: we need to walk the
2614 // basic blocks in a breadth-first order as we insert live successors. To
2615 // accomplish this we use a small-size optimized SetVector, which keeps the
2616 // common case cheap since we usually exit after crossing our threshold.
2617 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
2618 SmallPtrSet<BasicBlock *, 16>>
2619 BBSetVector;
2620 BBSetVector BBWorklist;
2621 BBWorklist.insert(&F.getEntryBlock());
2622
2623 // Note that we *must not* cache the size, this loop grows the worklist.
2624 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
2625 if (shouldStop())
2626 break;
2627
2628 BasicBlock *BB = BBWorklist[Idx];
2629 if (BB->empty())
2630 continue;
2631
2632 onBlockStart(BB);
2633
2634 // Disallow inlining a blockaddress with uses other than strictly callbr.
2635 // A blockaddress only has defined behavior for an indirect branch in the
2636 // same function, and we do not currently support inlining indirect
2637 // branches. But, the inliner may not see an indirect branch that ends up
2638 // being dead code at a particular call site. If the blockaddress escapes
2639 // the function, e.g., via a global variable, inlining may lead to an
2640 // invalid cross-function reference.
2641 // FIXME: pr/39560: continue relaxing this overt restriction.
2642 if (BB->hasAddressTaken())
2643 for (User *U : BlockAddress::get(&*BB)->users())
2644 if (!isa<CallBrInst>(*U))
2645 return InlineResult::failure("blockaddress used outside of callbr");
2646
2647 // Analyze the cost of this block. If we blow through the threshold, this
2648 // returns false, and we can bail out.
2649 InlineResult IR = analyzeBlock(BB, EphValues);
2650 if (!IR.isSuccess())
2651 return IR;
2652
2653 Instruction *TI = BB->getTerminator();
2654
2655 // Add in the live successors by first checking whether we have terminator
2656 // that may be simplified based on the values simplified by this call.
2657 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
2658 if (BI->isConditional()) {
2659 Value *Cond = BI->getCondition();
2660 if (ConstantInt *SimpleCond =
2661 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2662 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
2663 BBWorklist.insert(NextBB);
2664 KnownSuccessors[BB] = NextBB;
2665 findDeadBlocks(BB, NextBB);
2666 continue;
2667 }
2668 }
2669 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
2670 Value *Cond = SI->getCondition();
2671 if (ConstantInt *SimpleCond =
2672 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
2673 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
2674 BBWorklist.insert(NextBB);
2675 KnownSuccessors[BB] = NextBB;
2676 findDeadBlocks(BB, NextBB);
2677 continue;
2678 }
2679 }
2680
2681 // If we're unable to select a particular successor, just count all of
2682 // them.
2683 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
2684 ++TIdx)
2685 BBWorklist.insert(TI->getSuccessor(TIdx));
2686
2687 onBlockAnalyzed(BB);
2688 }
2689
2690 bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneLiveUse() &&
2691 &F == CandidateCall.getCalledFunction();
2692 // If this is a noduplicate call, we can still inline as long as
2693 // inlining this would cause the removal of the caller (so the instruction
2694 // is not actually duplicated, just moved).
2695 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
2696 return InlineResult::failure("noduplicate");
2697
2698 // If the callee's stack size exceeds the user-specified threshold,
2699 // do not let it be inlined.
2700 if (AllocatedSize > StackSizeThreshold)
2701 return InlineResult::failure("stacksize");
2702
2703 return finalizeAnalysis();
2704 }
2705
2706 void InlineCostCallAnalyzer::print(raw_ostream &OS) {
2707 #define DEBUG_PRINT_STAT(x) OS << " " #x ": " << x << "\n"
2708 if (PrintInstructionComments)
2709 F.print(OS, &Writer);
2710 DEBUG_PRINT_STAT(NumConstantArgs);
2711 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
2712 DEBUG_PRINT_STAT(NumAllocaArgs);
2713 DEBUG_PRINT_STAT(NumConstantPtrCmps);
2714 DEBUG_PRINT_STAT(NumConstantPtrDiffs);
2715 DEBUG_PRINT_STAT(NumInstructionsSimplified);
2716 DEBUG_PRINT_STAT(NumInstructions);
2717 DEBUG_PRINT_STAT(SROACostSavings);
2718 DEBUG_PRINT_STAT(SROACostSavingsLost);
2719 DEBUG_PRINT_STAT(LoadEliminationCost);
2720 DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
2721 DEBUG_PRINT_STAT(Cost);
2722 DEBUG_PRINT_STAT(Threshold);
2723 #undef DEBUG_PRINT_STAT
2724 }
2725
2726 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2727 /// Dump stats about this call's analysis.
2728 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(dbgs()); }
2729 #endif
2730
2731 /// Test that there are no attribute conflicts between Caller and Callee
2732 /// that prevent inlining.
2733 static bool functionsHaveCompatibleAttributes(
2734 Function *Caller, Function *Callee, TargetTransformInfo &TTI,
2735 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
2736 // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
2737 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
2738 // object, and always returns the same object (which is overwritten on each
2739 // GetTLI call). Therefore we copy the first result.
2740 auto CalleeTLI = GetTLI(*Callee);
2741 return (IgnoreTTIInlineCompatible ||
2742 TTI.areInlineCompatible(Caller, Callee)) &&
2743 GetTLI(*Caller).areInlineCompatible(CalleeTLI,
2744 InlineCallerSupersetNoBuiltin) &&
2745 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
2746 }
2747
2748 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
2749 int Cost = 0;
2750 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
2751 if (Call.isByValArgument(I)) {
2752 // We approximate the number of loads and stores needed by dividing the
2753 // size of the byval type by the target's pointer size.
2754 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2755 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I));
2756 unsigned AS = PTy->getAddressSpace();
2757 unsigned PointerSize = DL.getPointerSizeInBits(AS);
2758 // Ceiling division.
2759 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
2760
2761 // If it generates more than 8 stores it is likely to be expanded as an
2762 // inline memcpy so we take that as an upper bound. Otherwise we assume
2763 // one load and one store per word copied.
2764 // FIXME: The maxStoresPerMemcpy setting from the target should be used
2765 // here instead of a magic number of 8, but it's not available via
2766 // DataLayout.
2767 NumStores = std::min(NumStores, 8U);
2768
2769 Cost += 2 * NumStores * InlineConstants::InstrCost;
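      // Editorial note with a worked example (illustrative, not from the
      // original source): for a 64-byte byval struct on a target with 64-bit
      // pointers, TypeSize is 512 and PointerSize is 64, so NumStores is
      // ceil(512 / 64) = 8 (exactly at the clamp), and the argument adds
      // 2 * 8 * InstrCost to the cost, i.e. one load and one store per word.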
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}

Optional<int> llvm::getInliningCostEstimate(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  const InlineParams Params = {/* DefaultThreshold*/ 0,
                               /*HintThreshold*/ {},
                               /*ColdThreshold*/ {},
                               /*OptSizeThreshold*/ {},
                               /*OptMinSizeThreshold*/ {},
                               /*HotCallSiteThreshold*/ {},
                               /*LocallyHotCallSiteThreshold*/ {},
                               /*ColdCallSiteThreshold*/ {},
                               /*ComputeFullInlineCost*/ true,
                               /*EnableDeferral*/ true};

  InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE, true,
                            /*IgnoreThreshold*/ true);
  auto R = CA.analyze();
  if (!R.isSuccess())
    return None;
  return CA.getCost();
}
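
// Editorial usage sketch (not part of the original source): a client that
// already has new-pass-manager analysis results available could query the
// cost estimate roughly as follows. The lambdas and the names FAM, CB, TTI,
// PSI and ORE are hypothetical placeholders for whatever the caller has in
// scope.
//
//   auto GetAC = [&](Function &F) -> AssumptionCache & {
//     return FAM.getResult<AssumptionAnalysis>(F);
//   };
//   auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
//     return FAM.getResult<BlockFrequencyAnalysis>(F);
//   };
//   if (Optional<int> Cost =
//           getInliningCostEstimate(CB, TTI, GetAC, GetBFI, &PSI, &ORE))
//     dbgs() << "estimated inlining cost: " << *Cost << "\n";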

Optional<InlineCostFeatures> llvm::getInliningCostFeatures(
    CallBase &Call, TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  InlineCostFeaturesAnalyzer CFA(CalleeTTI, GetAssumptionCache, GetBFI, PSI,
                                 ORE, *Call.getCalledFunction(), Call);
  auto R = CFA.analyze();
  if (!R.isSuccess())
    return None;
  return CFA.features();
}

Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
    CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {

  // Cannot inline indirect calls.
  if (!Callee)
    return InlineResult::failure("indirect call");

  // When a callee coroutine function is inlined into a caller coroutine
  // function before the coro-split pass, the coro-early pass cannot handle
  // it well. So we do not inline a coroutine function that has not yet been
  // split.
  if (Callee->isPresplitCoroutine())
    return InlineResult::failure("unsplited coroutine call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle the fact
  // that the argument is in the alloca address space (so it is a little bit
  // complicated to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return InlineResult::failure("byval arguments without alloca"
                                     " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    if (Call.getAttributes().hasFnAttr(Attribute::NoInline))
      return InlineResult::failure("noinline call site attribute");

    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return InlineResult::success();
    return InlineResult::failure(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
    return InlineResult::failure("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return InlineResult::failure("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return InlineResult::failure("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return InlineResult::failure("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return InlineResult::failure("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return InlineResult::failure("noinline call site attribute");

  return None;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
    function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
    function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  auto UserDecision =
      llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);

  if (UserDecision) {
    if (UserDecision->isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(UserDecision->getFailureReason());
  }

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Call.getCaller()->getName()
                          << ")\n");

  InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
                            GetAssumptionCache, GetBFI, PSI, ORE);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Always make the cost-benefit-based decision explicit. We use always/never
  // here since the threshold is not meaningful, as it is not what drives the
  // cost-benefit analysis.
  if (CA.wasDecidedByCostBenefit()) {
    if (ShouldInline.isSuccess())
      return InlineCost::getAlways("benefit over cost",
                                   CA.getCostBenefitPair());
    else
      return InlineCost::getNever("cost over benefit", CA.getCostBenefitPair());
  }

  if (CA.wasDecidedByCostThreshold())
    return InlineCost::get(CA.getCost(), CA.getThreshold());

  // No details on how the decision was made; simply return always or never.
  return ShouldInline.isSuccess()
             ? InlineCost::getAlways("empty function")
             : InlineCost::getNever(ShouldInline.getFailureReason());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (BasicBlock &BB : F) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BB.getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BB.hasAddressTaken())
      for (User *U : BlockAddress::get(&BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : BB) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      Function *Callee = Call->getCalledFunction();
      if (&F == Callee)
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      if (Callee)
        switch (Callee->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because the current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing so
          // correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  // * optimization or size-optimization levels,
  // * a value passed to the createFunctionInliningPass function, or
  // * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that the optimization level is O3 (in the getInlineParams variant
  // that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}
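
// Editorial note (illustrative, not part of the original source): with
// "-inline-threshold=300" given on the command line, getInlineParams(100)
// produces DefaultThreshold == 300, and OptSizeThreshold/OptMinSizeThreshold
// are left unset so that the explicit threshold also governs optsize/minsize
// callees; without the flag, DefaultThreshold == 100 and the size thresholds
// come from InlineConstants.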

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}
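
// Editorial note (illustrative, not part of the original source): the mapping
// above means (OptLevel, SizeOptLevel) == (3, 0) yields OptAggressiveThreshold,
// (2, 1) yields OptSizeThreshold (-Os), (2, 2) yields OptMinSizeThreshold
// (-Oz), and anything else, e.g. (2, 0), falls back to DefaultThreshold.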

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has an
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}

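// Editorial note: the printer pass below is normally run through the new pass
// manager. An illustrative invocation, assuming the pass is registered in the
// pass registry under the name "print<inline-cost>", would be:
//
//   opt -passes='print<inline-cost>' -disable-output input.ll
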
PreservedAnalyses
InlineCostAnnotationPrinterPass::run(Function &F,
                                     FunctionAnalysisManager &FAM) {
  PrintInstructionComments = true;
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };
  Module *M = F.getParent();
  ProfileSummaryInfo PSI(*M);
  DataLayout DL(M);
  TargetTransformInfo TTI(DL);
  // FIXME: Redesign the usage of InlineParams to expand the scope of this
  // pass. In the current implementation, the type of InlineParams does not
  // matter as the pass serves only to verify the inliner's decisions. We
  // could add a flag that determines the InlineParams for this run; right
  // now, the default InlineParams are used.
  const InlineParams Params = llvm::getInlineParams();
  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (CallInst *CI = dyn_cast<CallInst>(&I)) {
        Function *CalledFunction = CI->getCalledFunction();
        if (!CalledFunction || CalledFunction->isDeclaration())
          continue;
        OptimizationRemarkEmitter ORE(CalledFunction);
        InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
                                    GetAssumptionCache, nullptr, &PSI, &ORE);
        ICCA.analyze();
        OS << " Analyzing call of " << CalledFunction->getName()
           << "... (caller:" << CI->getCaller()->getName() << ")\n";
        ICCA.print(OS);
        OS << "\n";
      }
    }
  }
  return PreservedAnalyses::all();
}