//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::ZeroOrMore,
                     cl::desc("Default amount of inlining to perform"));

static cl::opt<bool> PrintInstructionComments(
    "print-instruction-comments", cl::Hidden, cl::init(false),
    cl::desc("Prints comments for instructions based on inline cost analysis"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

static cl::opt<bool> InlineEnableCostBenefitAnalysis(
    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
    cl::desc("Enable the cost-benefit analysis for the inliner"));

static cl::opt<int> InlineSavingsMultiplier(
    "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore,
    cl::desc("Multiplier to multiply cycle savings by during inlining"));

static cl::opt<int>
    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
                        cl::ZeroOrMore,
                        cl::desc("The maximum size of a callee that gets "
                                 "inlined without sufficient cycle savings"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

static cl::opt<bool> InlineCallerSupersetNoBuiltin(
    "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
    cl::ZeroOrMore,
    cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
             "attributes."));

static cl::opt<bool> DisableGEPConstOperand(
    "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
    cl::desc("Disables evaluation of GetElementPtr with constant operands"));
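
// As a quick illustration of how these knobs are exercised (a hypothetical
// invocation; exact pass names vary by release), one can raise the threshold
// and dump per-instruction cost annotations like so:
//
//   opt -passes='print<inline-cost>' -inline-threshold=500 \
//       -print-instruction-comments -S input.ll
//
// The flags map directly onto the cl::opt declarations above.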

namespace {
class InlineCostCallAnalyzer;

// This struct is used to store information about the inline cost of a
// particular instruction.
struct InstructionCostDetail {
  int CostBefore = 0;
  int CostAfter = 0;
  int ThresholdBefore = 0;
  int ThresholdAfter = 0;

  int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }

  int getCostDelta() const { return CostAfter - CostBefore; }

  bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
};

class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
private:
  InlineCostCallAnalyzer *const ICCA;

public:
  InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
  void emitInstructionAnnot(const Instruction *I,
                            formatted_raw_ostream &OS) override;
};

/// Carry out call site analysis, in order to evaluate inlinability.
/// NOTE: the type is currently used as an implementation detail of functions
/// such as llvm::getInlineCost. Note the function_ref constructor parameters -
/// the expectation is that they come from the outer scope, from the wrapper
/// functions. If we want to support constructing CallAnalyzer objects where
/// lambdas are provided inline at construction, or where the object needs to
/// otherwise survive past the scope of the provided functions, we need to
/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() {}

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  /// Called before a basic block is analyzed.
  virtual void onBlockStart(const BasicBlock *BB) {}

  /// Called after a basic block has been analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called before an instruction is analyzed.
  virtual void onInstructionAnalysisStart(const Instruction *I) {}

  /// Called after an instruction has been analyzed.
  virtual void onInstructionAnalysisFinish(const Instruction *I) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point
  /// in continuing the analysis (e.g. we've determined already that the call
  /// site is too expensive to inline).
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason the analysis can't continue if that's the case, or 'true' if it
  /// may continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called when the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for the expectation that the inlining would result in a
  /// load elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of the given size. Return false to stop further
  /// processing of the switch instruction.
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of the given size. Return false to stop
  /// further processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
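
  // Illustrative sketch (hypothetical IR) of what the load-elimination model
  // above anticipates: in a callee body such as
  //   store i32 %a, i32* %p
  //   %v1 = load i32, i32* %p
  //   %v2 = load i32, i32* %p   ; repeated load from an address in LoadAddrSet
  // the repeated load is expected to fold away post-inlining, so a savings
  // credit is recorded via onLoadEliminationOpportunity(); a subsequent store
  // to memory conservatively cancels the accumulated credit.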

  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at
  /// \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
               function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
               function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
               ProfileSummaryInfo *PSI = nullptr,
               OptimizationRemarkEmitter *ORE = nullptr)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), EnableLoadElimination(true) {}

  InlineResult analyze();

  Optional<Constant *> getSimplifiedValue(Instruction *I) {
    auto It = SimplifiedValues.find(I);
    if (It != SimplifiedValues.end())
      return It->second;
    return None;
  }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note the
/// FIXME in onLoweredCall about instantiating an InlineCostCallAnalyzer there.
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;

  /// Bonus to be applied when the percentage of vector instructions in the
  /// callee is high (see more details in updateThreshold).
  int VectorBonus = 0;

  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  // This DenseMap stores the delta change in cost and threshold after
  // accounting for the given instruction. The map is filled only when the
  // flag PrintInstructionComments is on.
  DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// Attempt to evaluate indirect calls to boost the inline cost.
  const bool BoostIndirectCalls;

  /// Ignore the threshold when finalizing analysis.
  const bool IgnoreThreshold;

  // True if the cost-benefit-analysis-based inliner is enabled.
  const bool CostBenefitAnalysisEnabled;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  // The cumulative cost at the beginning of the basic block being analyzed. At
  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
  // the size of that basic block.
  int CostAtBBStart = 0;

  // The static size of live but cold basic blocks. This is "static" in the
  // sense that it's not weighted by profile counts at all.
  int ColdSize = 0;

  // Whether inlining is decided by cost-benefit analysis.
  bool DecidedByCostBenefit = false;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }

  void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }

  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);
  }
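
  // A @llvm.load.relative intrinsic is typically expanded into a GEP off the
  // base pointer, a 32-bit load of the stored relative value, an extension,
  // and an add of that value back onto the base -- roughly the four
  // instructions the comment below refers to. The handler charges only three
  // units because the generic per-instruction accounting already covers the
  // call itself. (This lowering sketch is illustrative, not normative.)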
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    addCost(3 * InlineConstants::InstrCost);
  }

  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
                                GetAssumptionCache, GetBFI, PSI, ORE, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(InlineConstants::CallPenalty);
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Cost increments in this function are capped at CostUpperBound.
    if (JumpTableSize) {
      int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                       4 * InlineConstants::InstrCost;

      addCost(JTCost, (int64_t)CostUpperBound);
      return;
    }

    // Considering forming a binary search, we should find the number of nodes
    // which is the same as the number of comparisons when lowered. For a given
    // number of clusters, n, we can define a recursive function, f(n), to find
    // the number of nodes in the tree. The recursion is:
    //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
    //   f(n) = n,                       when n <= 3.
    // This will lead to a binary tree where the leaf should be either f(2) or
    // f(3) when n > 3. So, the number of comparisons from leaves should be n,
    // while the number of non-leaf nodes should be:
    //   2^(log2(n) - 1) - 1
    //   = 2^log2(n) * 2^-1 - 1
    //   = n / 2 - 1.
    // Considering comparisons from leaf and non-leaf nodes, we can estimate
    // the number of comparisons in a simple closed form:
    //   n + n / 2 - 1 = n * 3 / 2 - 1
    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
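      // Worked example with illustrative numbers: 3 clusters cost
      // 3 * 2 = 6 instruction units here, while 8 clusters instead take the
      // closed form below: 8 * 3 / 2 - 1 = 11 compares, i.e. 22 units.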
      addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
    int64_t SwitchCost =
        ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

    addCost(SwitchCost, (int64_t)CostUpperBound);
  }

  void onMissedSimplification() override {
    addCost(InlineConstants::InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    SROAArgCosts[Arg] = 0;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InlineConstants::InstrCost;
    SROACostSavings += InlineConstants::InstrCost;
  }

  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    if (CostBenefitAnalysisEnabled) {
      // Keep track of the static size of live but cold basic blocks. For now,
      // we define a cold basic block to be one that's never executed.
      assert(GetBFI && "GetBFI must be available");
      BlockFrequencyInfo *BFI = &(GetBFI(F));
      assert(BFI && "BFI must be available");
      auto ProfileCount = BFI->getBlockProfileCount(BB);
      assert(ProfileCount.hasValue());
      if (ProfileCount.getValue() == 0)
        ColdSize += Cost - CostAtBBStart;
    }

    auto *TI = BB->getTerminator();
    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  void onInstructionAnalysisStart(const Instruction *I) override {
    // This function is called to store the initial cost of inlining before
    // the given instruction is assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostBefore = Cost;
    InstructionCostDetailMap[I].ThresholdBefore = Threshold;
  }

  void onInstructionAnalysisFinish(const Instruction *I) override {
    // This function is called to find the new values of cost and threshold
    // after the instruction has been assessed.
    if (!PrintInstructionComments)
      return;
    InstructionCostDetailMap[I].CostAfter = Cost;
    InstructionCostDetailMap[I].ThresholdAfter = Threshold;
  }

  bool isCostBenefitAnalysisEnabled() {
    if (!PSI || !PSI->hasProfileSummary())
      return false;

    if (!GetBFI)
      return false;

    if (InlineEnableCostBenefitAnalysis.getNumOccurrences()) {
      // Honor the explicit request from the user.
      if (!InlineEnableCostBenefitAnalysis)
        return false;
    } else {
      // Otherwise, require an instrumentation profile.
      if (!PSI->hasInstrumentationProfile())
        return false;
    }

    auto *Caller = CandidateCall.getParent()->getParent();
    if (!Caller->getEntryCount())
      return false;

    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
    if (!CallerBFI)
      return false;

    // For now, limit to hot call sites.
    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
      return false;

    // Make sure we have a nonzero entry count.
    auto EntryCount = F.getEntryCount();
    if (!EntryCount || !EntryCount.getCount())
      return false;

    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    if (!CalleeBFI)
      return false;

    return true;
  }

  // Determine whether we should inline the given call site, taking into
  // account both the size cost and the cycle savings. Return None if we don't
  // have sufficient profiling information to decide.
  Optional<bool> costBenefitAnalysis() {
    if (!CostBenefitAnalysisEnabled)
      return None;

    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
    // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
    // falling back to the cost-based metric.
    // TODO: Improve this hacky condition.
    if (Threshold == 0)
      return None;

    assert(GetBFI);
    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
    assert(CalleeBFI);

    // The cycle savings expressed as the sum of InlineConstants::InstrCost
    // multiplied by the estimated dynamic count of each instruction we can
    // avoid. Savings come from the call site cost, such as argument setup and
    // the call instruction, as well as the instructions that are folded.
    //
    // We use 128-bit APInt here to avoid potential overflow. This variable
    // should stay well below 10^24 (or 2^80) in practice. This "worst" case
    // assumes that we can avoid or fold a billion instructions, each with a
    // profile count of 10^15 -- roughly the number of cycles for a 24-hour
    // period on a 4GHz machine.
    APInt CycleSavings(128, 0);

    for (auto &BB : F) {
      APInt CurrentSavings(128, 0);
      for (auto &I : BB) {
        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
          // Count a conditional branch as savings if it becomes unconditional.
          if (BI->isConditional() &&
              dyn_cast_or_null<ConstantInt>(
                  SimplifiedValues.lookup(BI->getCondition()))) {
            CurrentSavings += InlineConstants::InstrCost;
          }
        } else if (Value *V = dyn_cast<Value>(&I)) {
          // Count an instruction as savings if we can fold it.
          if (SimplifiedValues.count(V)) {
            CurrentSavings += InlineConstants::InstrCost;
          }
        }
      }

      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
      assert(ProfileCount.hasValue());
      CurrentSavings *= ProfileCount.getValue();
      CycleSavings += CurrentSavings;
    }

    // Compute the cycle savings per call.
    auto EntryProfileCount = F.getEntryCount();
    assert(EntryProfileCount.hasValue() && EntryProfileCount.getCount());
    auto EntryCount = EntryProfileCount.getCount();
    CycleSavings += EntryCount / 2;
    CycleSavings = CycleSavings.udiv(EntryCount);

    // Compute the total savings for the call site.
    auto *CallerBB = CandidateCall.getParent();
    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
    CycleSavings += getCallsiteCost(this->CandidateCall, DL);
    CycleSavings *= CallerBFI->getBlockProfileCount(CallerBB).getValue();

    // Remove the cost of the cold basic blocks.
    int Size = Cost - ColdSize;

    // Allow tiny callees to be inlined regardless of whether they meet the
    // savings threshold.
    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;

    // Return true if the savings justify the cost of inlining. Specifically,
    // we evaluate the following inequality:
    //
    //  CycleSavings      PSI->getOrCompHotCountThreshold()
    // -------------- >= -----------------------------------
    //       Size              InlineSavingsMultiplier
    //
    // Note that the left hand side is specific to a call site. The right hand
    // side is a constant for the entire executable.
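    //
    // Worked example with purely hypothetical numbers: with CycleSavings =
    // 4000, Size = 100, InlineSavingsMultiplier = 8, and a hot count threshold
    // of 100, the cross-multiplied comparison below evaluates
    // 4000 * 8 = 32000 against 100 * 100 = 10000, so inlining is approved.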
    APInt LHS = CycleSavings;
    LHS *= InlineSavingsMultiplier;
    APInt RHS(128, PSI->getOrCompHotCountThreshold());
    RHS *= Size;
    return LHS.uge(RHS);
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after
    // all other costs here, so will likely only be dealing with relatively
    // small functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed.
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::CallPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (auto Result = costBenefitAnalysis()) {
      DecidedByCostBenefit = true;
      if (Result.getValue())
        return InlineResult::success();
      else
        return InlineResult::failure("Cost over threshold.");
    }

    if (IgnoreThreshold || Cost < std::max(1, Threshold))
      return InlineResult::success();
    return InlineResult::failure("Cost over threshold.");
  }

  bool shouldStop() override {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InlineConstants::InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are somewhat arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties.
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold at any time, then, since cost cannot decrease, we can
    // stop processing the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      Function &Callee, CallBase &Call, const InlineParams &Params,
      const TargetTransformInfo &TTI,
      function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
      function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
      ProfileSummaryInfo *PSI = nullptr,
      OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
      bool IgnoreThreshold = false)
      : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE ||
                              isCostBenefitAnalysisEnabled()),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
        Writer(this) {}

  /// Annotation Writer for instruction details.
  InlineCostAnnotationWriter Writer;

  void dump();

  // Prints the same analysis as dump(), but its definition is not dependent
  // on the build.
  void print();

  Optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
    auto It = InstructionCostDetailMap.find(I);
    if (It != InstructionCostDetailMap.end())
      return It->second;
    return None;
  }

  virtual ~InlineCostCallAnalyzer() {}
  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }
  bool wasDecidedByCostBenefit() { return DecidedByCostBenefit; }
};
} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}

void InlineCostAnnotationWriter::emitInstructionAnnot(
    const Instruction *I, formatted_raw_ostream &OS) {
  // The cost of inlining of the given instruction is always printed.
  // The threshold delta is printed only when it is non-zero, which happens
  // when we decided to give a bonus at a particular instruction.
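  // The emitted annotation looks roughly like this (illustrative values):
  //   ; cost before = 30, cost after = 35, threshold before = 225,
  //     threshold after = 225, cost delta = 5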
  Optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
  if (!Record)
    OS << "; No analysis for the instruction";
  else {
    OS << "; cost before = " << Record->CostBefore
       << ", cost after = " << Record->CostAfter
       << ", threshold before = " << Record->ThresholdBefore
       << ", threshold after = " << Record->ThresholdAfter << ", ";
    OS << "cost delta = " << Record->getCostDelta();
    if (Record->hasThresholdChanged())
      OS << ", threshold delta = " << Record->getThresholdDelta();
  }
  auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
  if (C) {
    OS << ", simplified to ";
    C.getValue()->print(OS, true);
  }
  OS << "\n";
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}
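
// For example (illustrative IR, assuming a typical 64-bit DataLayout): for
//   %g = getelementptr { i32, i64 }, { i32, i64 }* %s, i64 1, i32 1
// the i64 array index contributes sizeof({ i32, i64 }) = 16 bytes and the
// field index contributes the field-1 offset of 8 bytes, so 24 is accumulated
// into Offset.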

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (const Use &Op : GEP.indices())
    if (Constant *SimpleOp = SimplifiedValues.lookup(Op))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(Op);
  return TTI.getUserCost(&GEP, Operands,
                         TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  disableSROA(I.getOperand(0));

  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      // Sometimes a dynamic alloca could be converted into a static alloca
      // after this constant prop, and become a huge static alloca on an
      // unconditional CFG path. Avoid inlining if this is going to happen
      // above a threshold.
      // FIXME: If the threshold is removed or lowered too much, we could end
      // up being too pessimistic and prevent inlining non-problematic code.
      // This could result in unintended perf regressions. A better overall
      // strategy is needed to track stack usage during inlining.
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(),
          DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
      if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline)
        HasDynamicAlloca = true;
      return false;
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize =
        SaturatingAdd(DL.getTypeAllocSize(Ty).getKnownMinSize(), AllocatedSize);
  }

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  if (!I.isStaticAlloca())
    HasDynamicAlloca = true;

  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see the ZeroOffset is used as a dummy value, so we can probably use
  // any bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise early exit because we either see a different constant or
      // saw a constant before but we have a pointer with constant offset this
      // time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the first time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the first time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    if (auto *SROAArg = getSROAArgForValueOrNull(FirstV))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand());

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (const Use &Op : GEP.indices())
      if (!isa<Constant>(Op) && !SimplifiedValues.lookup(Op))
        return false;
    return true;
  };

  if (!DisableGEPConstOperand)
    if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
          SmallVector<Constant *, 2> Indices;
          for (unsigned int Index = 1; Index < COps.size(); ++Index)
            Indices.push_back(COps[Index]);
          return ConstantExpr::getGetElementPtr(
              I.getSourceElementType(), COps[0], Indices, I.isInBounds());
        }))
      return true;

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROAArg)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}
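
// For example (illustrative IR): a GEP such as
//   %q = getelementptr inbounds [4 x i32], [4 x i32]* %p, i64 0, i64 2
// with %p tracked as an SROA-able alloca-derived argument stays free and
// keeps %q associated with the same alloca, whereas a variable index both
// pays for the GEP (unless TTI says it is free) and disables SROA for the
// underlying alloca.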

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize == DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in and of themselves
  // disable SROA (ext) without some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}
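
// For example (illustrative): on a target with 64-bit pointers, a ptrtoint of
// a tracked pointer to i64 keeps its (base, offset) pair alive, whereas a
// truncating ptrtoint to i32 drops it, since the narrower integer can no
// longer represent the full pointer.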

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications, provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
    SROAArgValues[&I] = SROAArg;

  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't explicitly list
  // elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(Call, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to the caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {

  // If a global profile summary is available, then the callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to the caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}
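
// Worked example with hypothetical counts: given a caller entry frequency of
// 1000, the default ColdCallSiteRelFreq of 2 marks a callsite cold when its
// block frequency is below 2% of entry (i.e. < 20), while the default
// HotCallSiteRelFreq of 60 marks it locally hot when its frequency is at
// least 60 * 1000 = 60000.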
1492 // 1493 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining 1494 // of the last call to a static function as inlining such functions is 1495 // guaranteed to reduce code size. 1496 // 1497 // These bonus percentages may be set to 0 based on properties of the caller 1498 // and the callsite. 1499 int SingleBBBonusPercent = 50; 1500 int VectorBonusPercent = TTI.getInlinerVectorBonusPercent(); 1501 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus; 1502 1503 // Lambda to set all the above bonus and bonus percentages to 0. 1504 auto DisallowAllBonuses = [&]() { 1505 SingleBBBonusPercent = 0; 1506 VectorBonusPercent = 0; 1507 LastCallToStaticBonus = 0; 1508 }; 1509 1510 // Use the OptMinSizeThreshold or OptSizeThreshold knobs if they are available 1511 // and reduce the threshold if the caller has the necessary attribute. 1512 if (Caller->hasMinSize()) { 1513 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); 1514 // For minsize, we want to disable the single BB bonus and the vector 1515 // bonuses, but not the last-call-to-static bonus. Inlining the last call to 1516 // a static function will, at the minimum, eliminate the parameter setup and 1517 // call/return instructions. 1518 SingleBBBonusPercent = 0; 1519 VectorBonusPercent = 0; 1520 } else if (Caller->hasOptSize()) 1521 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); 1522 1523 // Adjust the threshold based on the inlinehint attribute and profile-based 1524 // hotness information if the caller does not have the MinSize attribute. 1525 if (!Caller->hasMinSize()) { 1526 if (Callee.hasFnAttribute(Attribute::InlineHint)) 1527 Threshold = MaxIfValid(Threshold, Params.HintThreshold); 1528 1529 // FIXME: After switching to the new passmanager, simplify the logic below 1530 // by checking only the callsite hotness/coldness as we will reliably 1531 // have local profile information. 1532 // 1533 // Callsite hotness and coldness can be determined if sample profile is 1534 // used (which adds hotness metadata to calls) or if caller's 1535 // BlockFrequencyInfo is available. 1536 BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr; 1537 auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI); 1538 if (!Caller->hasOptSize() && HotCallSiteThreshold) { 1539 LLVM_DEBUG(dbgs() << "Hot callsite.\n"); 1540 // FIXME: This should update the threshold only if it exceeds the 1541 // current threshold, but AutoFDO + ThinLTO currently relies on this 1542 // behavior to prevent inlining of hot callsites during ThinLTO 1543 // compile phase. 1544 Threshold = HotCallSiteThreshold.getValue(); 1545 } else if (isColdCallSite(Call, CallerBFI)) { 1546 LLVM_DEBUG(dbgs() << "Cold callsite.\n"); 1547 // Do not apply bonuses for a cold callsite including the 1548 // LastCallToStatic bonus. While this bonus might result in code size 1549 // reduction, it can cause the size of a non-cold caller to increase 1550 // preventing it from being inlined. 1551 DisallowAllBonuses(); 1552 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); 1553 } else if (PSI) { 1554 // Use callee's global profile information only if we have no way of 1555 // determining this via callsite information. 1556 if (PSI->isFunctionEntryHot(&Callee)) { 1557 LLVM_DEBUG(dbgs() << "Hot callee.\n"); 1558 // If callsite hotness cannot be determined, we may still know 1559 // that the callee is hot and treat it as a weaker hint for threshold 1560 // increase.
1561 Threshold = MaxIfValid(Threshold, Params.HintThreshold); 1562 } else if (PSI->isFunctionEntryCold(&Callee)) { 1563 LLVM_DEBUG(dbgs() << "Cold callee.\n"); 1564 // Do not apply bonuses for a cold callee including the 1565 // LastCallToStatic bonus. While this bonus might result in code size 1566 // reduction, it can cause the size of a non-cold caller to increase 1567 // preventing it from being inlined. 1568 DisallowAllBonuses(); 1569 Threshold = MinIfValid(Threshold, Params.ColdThreshold); 1570 } 1571 } 1572 } 1573 1574 Threshold += TTI.adjustInliningThreshold(&Call); 1575 1576 // Finally, take the target-specific inlining threshold multiplier into 1577 // account. 1578 Threshold *= TTI.getInliningThresholdMultiplier(); 1579 1580 SingleBBBonus = Threshold * SingleBBBonusPercent / 100; 1581 VectorBonus = Threshold * VectorBonusPercent / 100; 1582 1583 bool OnlyOneCallAndLocalLinkage = 1584 F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction(); 1585 // If there is only one call of the function, and it has internal linkage, 1586 // the cost of inlining it drops dramatically. It may seem odd to update 1587 // Cost in updateThreshold, but the bonus depends on the logic in this method. 1588 if (OnlyOneCallAndLocalLinkage) 1589 Cost -= LastCallToStaticBonus; 1590 } 1591 1592 bool CallAnalyzer::visitCmpInst(CmpInst &I) { 1593 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 1594 // First try to handle simplified comparisons. 1595 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 1596 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]); 1597 })) 1598 return true; 1599 1600 if (I.getOpcode() == Instruction::FCmp) 1601 return false; 1602 1603 // Otherwise look for a comparison between constant offset pointers with 1604 // a common base. 1605 Value *LHSBase, *RHSBase; 1606 APInt LHSOffset, RHSOffset; 1607 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 1608 if (LHSBase) { 1609 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 1610 if (RHSBase && LHSBase == RHSBase) { 1611 // We have common bases, fold the icmp to a constant based on the 1612 // offsets. 1613 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 1614 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 1615 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) { 1616 SimplifiedValues[&I] = C; 1617 ++NumConstantPtrCmps; 1618 return true; 1619 } 1620 } 1621 } 1622 1623 // If the comparison is an equality comparison with null, we can simplify it 1624 // if we know the value (argument) can't be null 1625 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) && 1626 isKnownNonNullInCallee(I.getOperand(0))) { 1627 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; 1628 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType()) 1629 : ConstantInt::getFalse(I.getType()); 1630 return true; 1631 } 1632 return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1))); 1633 } 1634 1635 bool CallAnalyzer::visitSub(BinaryOperator &I) { 1636 // Try to handle a special case: we can fold computing the difference of two 1637 // constant-related pointers. 
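  // As an illustrative sketch (hypothetical source, not from this file):
  // after inlining a callee such as
  //
  //   static ptrdiff_t dist(char *A, char *B) { return A - B; }
  //   char Buf[16];
  //   ptrdiff_t D = dist(Buf + 8, Buf + 4);
  //
  // both pointers reach ConstantOffsetPtrs with the common base 'Buf' and
  // offsets 8 and 4, so the subtraction below folds to the constant 4 and
  // is counted as free.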
1638 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 1639 Value *LHSBase, *RHSBase; 1640 APInt LHSOffset, RHSOffset; 1641 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 1642 if (LHSBase) { 1643 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 1644 if (RHSBase && LHSBase == RHSBase) { 1645 // We have common bases, fold the subtract to a constant based on the 1646 // offsets. 1647 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 1648 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 1649 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) { 1650 SimplifiedValues[&I] = C; 1651 ++NumConstantPtrDiffs; 1652 return true; 1653 } 1654 } 1655 } 1656 1657 // Otherwise, fall back to the generic logic for simplifying and handling 1658 // instructions. 1659 return Base::visitSub(I); 1660 } 1661 1662 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { 1663 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 1664 Constant *CLHS = dyn_cast<Constant>(LHS); 1665 if (!CLHS) 1666 CLHS = SimplifiedValues.lookup(LHS); 1667 Constant *CRHS = dyn_cast<Constant>(RHS); 1668 if (!CRHS) 1669 CRHS = SimplifiedValues.lookup(RHS); 1670 1671 Value *SimpleV = nullptr; 1672 if (auto FI = dyn_cast<FPMathOperator>(&I)) 1673 SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, 1674 FI->getFastMathFlags(), DL); 1675 else 1676 SimpleV = 1677 SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL); 1678 1679 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) 1680 SimplifiedValues[&I] = C; 1681 1682 if (SimpleV) 1683 return true; 1684 1685 // Disable any SROA on arguments to arbitrary, unsimplified binary operators. 1686 disableSROA(LHS); 1687 disableSROA(RHS); 1688 1689 // If the instruction is floating point, and the target says this operation 1690 // is expensive, this may eventually become a library call. Treat the cost 1691 // as such. Unless it's fneg which can be implemented with an xor. 1692 using namespace llvm::PatternMatch; 1693 if (I.getType()->isFloatingPointTy() && 1694 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive && 1695 !match(&I, m_FNeg(m_Value()))) 1696 onCallPenalty(); 1697 1698 return false; 1699 } 1700 1701 bool CallAnalyzer::visitFNeg(UnaryOperator &I) { 1702 Value *Op = I.getOperand(0); 1703 Constant *COp = dyn_cast<Constant>(Op); 1704 if (!COp) 1705 COp = SimplifiedValues.lookup(Op); 1706 1707 Value *SimpleV = SimplifyFNegInst( 1708 COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL); 1709 1710 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) 1711 SimplifiedValues[&I] = C; 1712 1713 if (SimpleV) 1714 return true; 1715 1716 // Disable any SROA on arguments to arbitrary, unsimplified fneg. 1717 disableSROA(Op); 1718 1719 return false; 1720 } 1721 1722 bool CallAnalyzer::visitLoad(LoadInst &I) { 1723 if (handleSROA(I.getPointerOperand(), I.isSimple())) 1724 return true; 1725 1726 // If the data is already loaded from this address and hasn't been clobbered 1727 // by any stores or calls, this load is likely to be redundant and can be 1728 // eliminated. 
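  // Illustrative sketch (hypothetical callee): in
  //
  //   int twice(int *P) { return *P + *P; }
  //
  // the second load of *P finds the pointer already in LoadAddrSet, so it is
  // credited as eliminable -- assuming no intervening store or call clobbered
  // the location, which would have disabled load elimination below.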
1729 if (EnableLoadElimination && 1730 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) { 1731 onLoadEliminationOpportunity(); 1732 return true; 1733 } 1734 1735 return false; 1736 } 1737 1738 bool CallAnalyzer::visitStore(StoreInst &I) { 1739 if (handleSROA(I.getPointerOperand(), I.isSimple())) 1740 return true; 1741 1742 // The store can potentially clobber loads and prevent repeated loads from 1743 // being eliminated. 1744 // FIXME: 1745 // 1. We can probably keep an initial set of eliminatable loads subtracted 1746 // from the cost even when we finally see a store. We just need to disable 1747 // *further* accumulation of elimination savings. 1748 // 2. We should probably at some point thread MemorySSA for the callee into 1749 // this and then use that to actually compute *really* precise savings. 1750 disableLoadElimination(); 1751 return false; 1752 } 1753 1754 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { 1755 // Constant folding for extract value is trivial. 1756 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 1757 return ConstantExpr::getExtractValue(COps[0], I.getIndices()); 1758 })) 1759 return true; 1760 1761 // SROA can't look through these, but they may be free. 1762 return Base::visitExtractValue(I); 1763 } 1764 1765 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { 1766 // Constant folding for insert value is trivial. 1767 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 1768 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0], 1769 /*InsertedValueOperand*/ COps[1], 1770 I.getIndices()); 1771 })) 1772 return true; 1773 1774 // SROA can't look through these, but they may be free. 1775 return Base::visitInsertValue(I); 1776 } 1777 1778 /// Try to simplify a call site. 1779 /// 1780 /// Takes a concrete function and callsite and tries to actually simplify it by 1781 /// analyzing the arguments and call itself with instsimplify. Returns true if 1782 /// it has simplified the callsite to some other entity (a constant), making it 1783 /// free. 1784 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { 1785 // FIXME: Using the instsimplify logic directly for this is inefficient 1786 // because we have to continually rebuild the argument list even when no 1787 // simplifications can be performed. Until that is fixed with remapping 1788 // inside of instsimplify, directly constant fold calls here. 1789 if (!canConstantFoldCallTo(&Call, F)) 1790 return false; 1791 1792 // Try to re-map the arguments to constants. 1793 SmallVector<Constant *, 4> ConstantArgs; 1794 ConstantArgs.reserve(Call.arg_size()); 1795 for (Value *I : Call.args()) { 1796 Constant *C = dyn_cast<Constant>(I); 1797 if (!C) 1798 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I)); 1799 if (!C) 1800 return false; // This argument doesn't map to a constant. 1801 1802 ConstantArgs.push_back(C); 1803 } 1804 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) { 1805 SimplifiedValues[&Call] = C; 1806 return true; 1807 } 1808 1809 return false; 1810 } 1811 1812 bool CallAnalyzer::visitCallBase(CallBase &Call) { 1813 if (Call.hasFnAttr(Attribute::ReturnsTwice) && 1814 !F.hasFnAttribute(Attribute::ReturnsTwice)) { 1815 // This aborts the entire analysis.
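    // Illustrative example: the classic returns-twice call wraps setjmp, e.g.
    //
    //   #include <setjmp.h>
    //   jmp_buf Env;
    //   int Checkpoint() { return setjmp(Env); }
    //
    // If the callee being analyzed contains such a call but is not itself
    // marked returns_twice, inlining it would hide the returns-twice
    // behavior from its callers, so we give up here.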
1816 ExposesReturnsTwice = true; 1817 return false; 1818 } 1819 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate()) 1820 ContainsNoDuplicateCall = true; 1821 1822 Value *Callee = Call.getCalledOperand(); 1823 Function *F = dyn_cast_or_null<Function>(Callee); 1824 bool IsIndirectCall = !F; 1825 if (IsIndirectCall) { 1826 // Check if this happens to be an indirect function call to a known function 1827 // in this inline context. If not, we've done all we can. 1828 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee)); 1829 if (!F) { 1830 onCallArgumentSetup(Call); 1831 1832 if (!Call.onlyReadsMemory()) 1833 disableLoadElimination(); 1834 return Base::visitCallBase(Call); 1835 } 1836 } 1837 1838 assert(F && "Expected a call to a known function"); 1839 1840 // When we have a concrete function, first try to simplify it directly. 1841 if (simplifyCallSite(F, Call)) 1842 return true; 1843 1844 // Next check if it is an intrinsic we know about. 1845 // FIXME: Lift this into part of the InstVisitor. 1846 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) { 1847 switch (II->getIntrinsicID()) { 1848 default: 1849 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II)) 1850 disableLoadElimination(); 1851 return Base::visitCallBase(Call); 1852 1853 case Intrinsic::load_relative: 1854 onLoadRelativeIntrinsic(); 1855 return false; 1856 1857 case Intrinsic::memset: 1858 case Intrinsic::memcpy: 1859 case Intrinsic::memmove: 1860 disableLoadElimination(); 1861 // SROA can usually chew through these intrinsics, but they aren't free. 1862 return false; 1863 case Intrinsic::icall_branch_funnel: 1864 case Intrinsic::localescape: 1865 HasUninlineableIntrinsic = true; 1866 return false; 1867 case Intrinsic::vastart: 1868 InitsVargArgs = true; 1869 return false; 1870 case Intrinsic::launder_invariant_group: 1871 case Intrinsic::strip_invariant_group: 1872 if (auto *SROAArg = getSROAArgForValueOrNull(II->getOperand(0))) 1873 SROAArgValues[II] = SROAArg; 1874 return true; 1875 } 1876 } 1877 1878 if (F == Call.getFunction()) { 1879 // This flag will fully abort the analysis, so don't bother with anything 1880 // else. 1881 IsRecursiveCall = true; 1882 return false; 1883 } 1884 1885 if (TTI.isLoweredToCall(F)) { 1886 onLoweredCall(F, Call, IsIndirectCall); 1887 } 1888 1889 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) 1890 disableLoadElimination(); 1891 return Base::visitCallBase(Call); 1892 } 1893 1894 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { 1895 // At least one return instruction will be free after inlining. 1896 bool Free = !HasReturn; 1897 HasReturn = true; 1898 return Free; 1899 } 1900 1901 bool CallAnalyzer::visitBranchInst(BranchInst &BI) { 1902 // We model unconditional branches as essentially free -- they really 1903 // shouldn't exist at all, but handling them makes the behavior of the 1904 // inliner more regular and predictable. Interestingly, conditional branches 1905 // which will fold away are also free. 
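  // For illustration (hypothetical callee): in
  //
  //   int pick(bool Fast, int A, int B) {
  //     if (Fast)
  //       return A;
  //     return B;
  //   }
  //
  // a call site passing a literal 'true' lets SimplifiedValues map the branch
  // condition to a constant, so the conditional branch below costs nothing.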
1906 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) || 1907 dyn_cast_or_null<ConstantInt>( 1908 SimplifiedValues.lookup(BI.getCondition())); 1909 } 1910 1911 bool CallAnalyzer::visitSelectInst(SelectInst &SI) { 1912 bool CheckSROA = SI.getType()->isPointerTy(); 1913 Value *TrueVal = SI.getTrueValue(); 1914 Value *FalseVal = SI.getFalseValue(); 1915 1916 Constant *TrueC = dyn_cast<Constant>(TrueVal); 1917 if (!TrueC) 1918 TrueC = SimplifiedValues.lookup(TrueVal); 1919 Constant *FalseC = dyn_cast<Constant>(FalseVal); 1920 if (!FalseC) 1921 FalseC = SimplifiedValues.lookup(FalseVal); 1922 Constant *CondC = 1923 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition())); 1924 1925 if (!CondC) { 1926 // Select C, X, X => X 1927 if (TrueC == FalseC && TrueC) { 1928 SimplifiedValues[&SI] = TrueC; 1929 return true; 1930 } 1931 1932 if (!CheckSROA) 1933 return Base::visitSelectInst(SI); 1934 1935 std::pair<Value *, APInt> TrueBaseAndOffset = 1936 ConstantOffsetPtrs.lookup(TrueVal); 1937 std::pair<Value *, APInt> FalseBaseAndOffset = 1938 ConstantOffsetPtrs.lookup(FalseVal); 1939 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { 1940 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; 1941 1942 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal)) 1943 SROAArgValues[&SI] = SROAArg; 1944 return true; 1945 } 1946 1947 return Base::visitSelectInst(SI); 1948 } 1949 1950 // Select condition is a constant. 1951 Value *SelectedV = CondC->isAllOnesValue() ? TrueVal 1952 : (CondC->isNullValue()) ? FalseVal 1953 : nullptr; 1954 if (!SelectedV) { 1955 // Condition is a vector constant that is not all 1s or all 0s. If all 1956 // operands are constants, ConstantExpr::getSelect() can handle the cases 1957 // such as select vectors. 1958 if (TrueC && FalseC) { 1959 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) { 1960 SimplifiedValues[&SI] = C; 1961 return true; 1962 } 1963 } 1964 return Base::visitSelectInst(SI); 1965 } 1966 1967 // Condition is either all 1s or all 0s. SI can be simplified. 1968 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) { 1969 SimplifiedValues[&SI] = SelectedC; 1970 return true; 1971 } 1972 1973 if (!CheckSROA) 1974 return true; 1975 1976 std::pair<Value *, APInt> BaseAndOffset = 1977 ConstantOffsetPtrs.lookup(SelectedV); 1978 if (BaseAndOffset.first) { 1979 ConstantOffsetPtrs[&SI] = BaseAndOffset; 1980 1981 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV)) 1982 SROAArgValues[&SI] = SROAArg; 1983 } 1984 1985 return true; 1986 } 1987 1988 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { 1989 // We model unconditional switches as free, see the comments on handling 1990 // branches. 1991 if (isa<ConstantInt>(SI.getCondition())) 1992 return true; 1993 if (Value *V = SimplifiedValues.lookup(SI.getCondition())) 1994 if (isa<ConstantInt>(V)) 1995 return true; 1996 1997 // Assume the most general case where the switch is lowered into 1998 // either a jump table, bit test, or a balanced binary tree consisting of 1999 // case clusters without merging adjacent clusters with the same 2000 // destination. We do not consider the switches that are lowered with a mix 2001 // of jump table/bit test/binary search tree. The cost of the switch is 2002 // proportional to the size of the tree or the size of jump table range. 2003 // 2004 // NB: We convert large switches which are just used to initialize large phi 2005 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent 2006 // inlining those. 
It will prevent inlining in cases where the optimization 2007 // does not (yet) fire. 2008 2009 unsigned JumpTableSize = 0; 2010 BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr; 2011 unsigned NumCaseCluster = 2012 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); 2013 2014 onFinalizeSwitch(JumpTableSize, NumCaseCluster); 2015 return false; 2016 } 2017 2018 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { 2019 // We never want to inline functions that contain an indirectbr. This is 2020 // incorrect because all the blockaddresses (in static global initializers 2021 // for example) would be referring to the original function, and this 2022 // indirect jump would jump from the inlined copy of the function into the 2023 // original function, which is extremely undefined behavior. 2024 // FIXME: This logic isn't really right; we can safely inline functions with 2025 // indirectbr's as long as no other function or global references the 2026 // blockaddress of a block within the current function. 2027 HasIndirectBr = true; 2028 return false; 2029 } 2030 2031 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { 2032 // FIXME: It's not clear that a single instruction is an accurate model for 2033 // the inline cost of a resume instruction. 2034 return false; 2035 } 2036 2037 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { 2038 // FIXME: It's not clear that a single instruction is an accurate model for 2039 // the inline cost of a cleanupret instruction. 2040 return false; 2041 } 2042 2043 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { 2044 // FIXME: It's not clear that a single instruction is an accurate model for 2045 // the inline cost of a catchret instruction. 2046 return false; 2047 } 2048 2049 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { 2050 // FIXME: It might be reasonable to discount the cost of instructions leading 2051 // to unreachable as they have the lowest possible impact on both runtime and 2052 // code size. 2053 return true; // No actual code is needed for unreachable. 2054 } 2055 2056 bool CallAnalyzer::visitInstruction(Instruction &I) { 2057 // Some instructions are free. All of the free intrinsics can also be 2058 // handled by SROA, etc. 2059 if (TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) == 2060 TargetTransformInfo::TCC_Free) 2061 return true; 2062 2063 // We found something we don't understand or can't handle. Mark any SROA-able 2064 // values in the operand list as no longer viable. 2065 for (const Use &Op : I.operands()) 2066 disableSROA(Op); 2067 2068 return false; 2069 } 2070 2071 /// Analyze a basic block for its contribution to the inline cost. 2072 /// 2073 /// This method walks the analyzer over every instruction in the given basic 2074 /// block and accounts for their cost during inlining at this callsite. It 2075 /// aborts early if the threshold has been exceeded or an impossible-to-inline 2076 /// construct has been detected. It returns false if inlining is no longer 2077 /// viable, and true if inlining remains viable. 2078 InlineResult 2079 CallAnalyzer::analyzeBlock(BasicBlock *BB, 2080 SmallPtrSetImpl<const Value *> &EphValues) { 2081 for (Instruction &I : *BB) { 2082 // FIXME: Currently, the number of instructions in a function, regardless of 2083 // our ability to simplify them during inlining to constants or dead code, 2084 // is actually used by the vector bonus heuristic.
As long as that's true, 2085 // we have to special case debug intrinsics here to prevent differences in 2086 // inlining due to debug symbols. Eventually, the number of unsimplified 2087 // instructions shouldn't factor into the cost computation, but until then, 2088 // hack around it here. 2089 if (isa<DbgInfoIntrinsic>(I)) 2090 continue; 2091 2092 // Skip pseudo-probes. 2093 if (isa<PseudoProbeInst>(I)) 2094 continue; 2095 2096 // Skip ephemeral values. 2097 if (EphValues.count(&I)) 2098 continue; 2099 2100 ++NumInstructions; 2101 if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy()) 2102 ++NumVectorInstructions; 2103 2104 // If the instruction simplified to a constant, there is no cost to this 2105 // instruction. Visit the instructions using our InstVisitor to account for 2106 // all of the per-instruction logic. The visit tree returns true if we 2107 // consumed the instruction in any way, and false if the instruction's base 2108 // cost should count against inlining. 2109 onInstructionAnalysisStart(&I); 2110 2111 if (Base::visit(&I)) 2112 ++NumInstructionsSimplified; 2113 else 2114 onMissedSimplification(); 2115 2116 onInstructionAnalysisFinish(&I); 2117 using namespace ore; 2118 // If visiting this instruction detected an uninlinable pattern, abort. 2119 InlineResult IR = InlineResult::success(); 2120 if (IsRecursiveCall) 2121 IR = InlineResult::failure("recursive"); 2122 else if (ExposesReturnsTwice) 2123 IR = InlineResult::failure("exposes returns twice"); 2124 else if (HasDynamicAlloca) 2125 IR = InlineResult::failure("dynamic alloca"); 2126 else if (HasIndirectBr) 2127 IR = InlineResult::failure("indirect branch"); 2128 else if (HasUninlineableIntrinsic) 2129 IR = InlineResult::failure("uninlinable intrinsic"); 2130 else if (InitsVargArgs) 2131 IR = InlineResult::failure("varargs"); 2132 if (!IR.isSuccess()) { 2133 if (ORE) 2134 ORE->emit([&]() { 2135 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", 2136 &CandidateCall) 2137 << NV("Callee", &F) << " has uninlinable pattern (" 2138 << NV("InlineResult", IR.getFailureReason()) 2139 << ") and cost is not fully computed"; 2140 }); 2141 return IR; 2142 } 2143 2144 // If the caller is a recursive function, then we don't want to inline 2145 // functions which allocate a lot of stack space because it would increase 2146 // the caller stack usage dramatically. 2147 if (IsCallerRecursive && 2148 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) { 2149 auto IR = 2150 InlineResult::failure("recursive and allocates too much stack space"); 2151 if (ORE) 2152 ORE->emit([&]() { 2153 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", 2154 &CandidateCall) 2155 << NV("Callee", &F) << " is " 2156 << NV("InlineResult", IR.getFailureReason()) 2157 << ". Cost is not fully computed"; 2158 }); 2159 return IR; 2160 } 2161 2162 if (shouldStop()) 2163 return InlineResult::failure( 2164 "Call site analysis is not favorable to inlining."); 2165 } 2166 2167 return InlineResult::success(); 2168 } 2169 2170 /// Compute the base pointer and cumulative constant offsets for V. 2171 /// 2172 /// This strips all constant offsets off of V, leaving it the base pointer, and 2173 /// accumulates the total constant offset applied in the returned constant. It 2174 /// returns 0 if V is not a pointer, and returns the constant '0' if there are 2175 /// no constant offsets applied.
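/// For illustration (hypothetical values): for a V equivalent to 'Buf + 12'
/// built from 'char Buf[16]' through inbounds GEPs, V is rewritten to 'Buf'
/// and the returned ConstantInt holds 12; for a pointer with no offsets at
/// all, the returned ConstantInt holds 0.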
2176 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { 2177 if (!V->getType()->isPointerTy()) 2178 return nullptr; 2179 2180 unsigned AS = V->getType()->getPointerAddressSpace(); 2181 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); 2182 APInt Offset = APInt::getNullValue(IntPtrWidth); 2183 2184 // Even though we don't look through PHI nodes, we could be called on an 2185 // instruction in an unreachable block, which may be on a cycle. 2186 SmallPtrSet<Value *, 4> Visited; 2187 Visited.insert(V); 2188 do { 2189 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { 2190 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) 2191 return nullptr; 2192 V = GEP->getPointerOperand(); 2193 } else if (Operator::getOpcode(V) == Instruction::BitCast) { 2194 V = cast<Operator>(V)->getOperand(0); 2195 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { 2196 if (GA->isInterposable()) 2197 break; 2198 V = GA->getAliasee(); 2199 } else { 2200 break; 2201 } 2202 assert(V->getType()->isPointerTy() && "Unexpected operand type!"); 2203 } while (Visited.insert(V).second); 2204 2205 Type *IdxPtrTy = DL.getIndexType(V->getType()); 2206 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset)); 2207 } 2208 2209 /// Find dead blocks due to deleted CFG edges during inlining. 2210 /// 2211 /// If we know the successor of the current block, \p CurrBB, has to be \p 2212 /// NextBB, the other successors of \p CurrBB are dead if these successors have 2213 /// no live incoming CFG edges. If one block is found to be dead, we can 2214 /// continue growing the dead block list by checking the successors of the dead 2215 /// blocks to see if all their incoming edges are dead or not. 2216 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { 2217 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { 2218 // A CFG edge is dead if the predecessor is dead or the predecessor has a 2219 // known successor which is not the one under exam. 2220 return (DeadBlocks.count(Pred) || 2221 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); 2222 }; 2223 2224 auto IsNewlyDead = [&](BasicBlock *BB) { 2225 // If all the edges to a block are dead, the block is also dead. 2226 return (!DeadBlocks.count(BB) && 2227 llvm::all_of(predecessors(BB), 2228 [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); 2229 }; 2230 2231 for (BasicBlock *Succ : successors(CurrBB)) { 2232 if (Succ == NextBB || !IsNewlyDead(Succ)) 2233 continue; 2234 SmallVector<BasicBlock *, 4> NewDead; 2235 NewDead.push_back(Succ); 2236 while (!NewDead.empty()) { 2237 BasicBlock *Dead = NewDead.pop_back_val(); 2238 if (DeadBlocks.insert(Dead)) 2239 // Continue growing the dead block lists. 2240 for (BasicBlock *S : successors(Dead)) 2241 if (IsNewlyDead(S)) 2242 NewDead.push_back(S); 2243 } 2244 } 2245 } 2246 2247 /// Analyze a call site for potential inlining. 2248 /// 2249 /// Returns true if inlining this call is viable, and false if it is not 2250 /// viable. It computes the cost and adjusts the threshold based on numerous 2251 /// factors and heuristics. If this method returns false but the computed cost 2252 /// is below the computed threshold, then inlining was forcibly disabled by 2253 /// some artifact of the routine. 
2254 InlineResult CallAnalyzer::analyze() { 2255 ++NumCallsAnalyzed; 2256 2257 auto Result = onAnalysisStart(); 2258 if (!Result.isSuccess()) 2259 return Result; 2260 2261 if (F.empty()) 2262 return InlineResult::success(); 2263 2264 Function *Caller = CandidateCall.getFunction(); 2265 // Check if the caller function is recursive itself. 2266 for (User *U : Caller->users()) { 2267 CallBase *Call = dyn_cast<CallBase>(U); 2268 if (Call && Call->getFunction() == Caller) { 2269 IsCallerRecursive = true; 2270 break; 2271 } 2272 } 2273 2274 // Populate our simplified values by mapping from function arguments to call 2275 // arguments with known important simplifications. 2276 auto CAI = CandidateCall.arg_begin(); 2277 for (Argument &FAI : F.args()) { 2278 assert(CAI != CandidateCall.arg_end()); 2279 if (Constant *C = dyn_cast<Constant>(CAI)) 2280 SimplifiedValues[&FAI] = C; 2281 2282 Value *PtrArg = *CAI; 2283 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { 2284 ConstantOffsetPtrs[&FAI] = std::make_pair(PtrArg, C->getValue()); 2285 2286 // We can SROA any pointer arguments derived from alloca instructions. 2287 if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) { 2288 SROAArgValues[&FAI] = SROAArg; 2289 onInitializeSROAArg(SROAArg); 2290 EnabledSROAAllocas.insert(SROAArg); 2291 } 2292 } 2293 ++CAI; 2294 } 2295 NumConstantArgs = SimplifiedValues.size(); 2296 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); 2297 NumAllocaArgs = SROAArgValues.size(); 2298 2299 // FIXME: If a caller has multiple calls to a callee, we end up recomputing 2300 // the ephemeral values multiple times (and they're completely determined by 2301 // the callee, so this is purely duplicate work). 2302 SmallPtrSet<const Value *, 32> EphValues; 2303 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues); 2304 2305 // The worklist of live basic blocks in the callee *after* inlining. We avoid 2306 // adding basic blocks of the callee which can be proven to be dead for this 2307 // particular call site in order to get more accurate cost estimates. This 2308 // requires a somewhat heavyweight iteration pattern: we need to walk the 2309 // basic blocks in a breadth-first order as we insert live successors. To 2310 // accomplish this, and because we exit early after crossing our threshold, 2311 // we use a small-size-optimized SetVector. 2312 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, 2313 SmallPtrSet<BasicBlock *, 16>> 2314 BBSetVector; 2315 BBSetVector BBWorklist; 2316 BBWorklist.insert(&F.getEntryBlock()); 2317 2318 // Note that we *must not* cache the size; this loop grows the worklist. 2319 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { 2320 if (shouldStop()) 2321 break; 2322 2323 BasicBlock *BB = BBWorklist[Idx]; 2324 if (BB->empty()) 2325 continue; 2326 2327 onBlockStart(BB); 2328 2329 // Disallow inlining a blockaddress with uses other than strictly callbr. 2330 // A blockaddress only has defined behavior for an indirect branch in the 2331 // same function, and we do not currently support inlining indirect 2332 // branches. But, the inliner may not see an indirect branch that ends up 2333 // being dead code at a particular call site. If the blockaddress escapes 2334 // the function, e.g., via a global variable, inlining may lead to an 2335 // invalid cross-function reference. 2336 // FIXME: pr/39560: continue relaxing this overt restriction.
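    // Illustrative example of the escape this guards against (GNU C labels
    // as values, a hypothetical snippet):
    //
    //   void *Escaped;
    //   void G() {
    //   Lbl:
    //     Escaped = &&Lbl; // blockaddress stored through a non-callbr user
    //   }
    //
    // Here the store is a user of the blockaddress other than a callbr, so an
    // inlined copy of G would leave Escaped pointing into the original body.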
2337 if (BB->hasAddressTaken()) 2338 for (User *U : BlockAddress::get(&*BB)->users()) 2339 if (!isa<CallBrInst>(*U)) 2340 return InlineResult::failure("blockaddress used outside of callbr"); 2341 2342 // Analyze the cost of this block. If we blow through the threshold, this 2343 // returns a failure, and we can bail out. 2344 InlineResult IR = analyzeBlock(BB, EphValues); 2345 if (!IR.isSuccess()) 2346 return IR; 2347 2348 Instruction *TI = BB->getTerminator(); 2349 2350 // Add in the live successors by first checking whether we have a terminator 2351 // that may be simplified based on the values simplified by this call. 2352 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { 2353 if (BI->isConditional()) { 2354 Value *Cond = BI->getCondition(); 2355 if (ConstantInt *SimpleCond = 2356 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 2357 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0); 2358 BBWorklist.insert(NextBB); 2359 KnownSuccessors[BB] = NextBB; 2360 findDeadBlocks(BB, NextBB); 2361 continue; 2362 } 2363 } 2364 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { 2365 Value *Cond = SI->getCondition(); 2366 if (ConstantInt *SimpleCond = 2367 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 2368 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor(); 2369 BBWorklist.insert(NextBB); 2370 KnownSuccessors[BB] = NextBB; 2371 findDeadBlocks(BB, NextBB); 2372 continue; 2373 } 2374 } 2375 2376 // If we're unable to select a particular successor, just count all of 2377 // them. 2378 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; 2379 ++TIdx) 2380 BBWorklist.insert(TI->getSuccessor(TIdx)); 2381 2382 onBlockAnalyzed(BB); 2383 } 2384 2385 bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() && 2386 &F == CandidateCall.getCalledFunction(); 2387 // If this is a noduplicate call, we can still inline as long as 2388 // inlining this would cause the removal of the caller (so the instruction 2389 // is not actually duplicated, just moved). 2390 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) 2391 return InlineResult::failure("noduplicate"); 2392 2393 return finalizeAnalysis(); 2394 } 2395 2396 void InlineCostCallAnalyzer::print() { 2397 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" 2398 if (PrintInstructionComments) 2399 F.print(dbgs(), &Writer); 2400 DEBUG_PRINT_STAT(NumConstantArgs); 2401 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); 2402 DEBUG_PRINT_STAT(NumAllocaArgs); 2403 DEBUG_PRINT_STAT(NumConstantPtrCmps); 2404 DEBUG_PRINT_STAT(NumConstantPtrDiffs); 2405 DEBUG_PRINT_STAT(NumInstructionsSimplified); 2406 DEBUG_PRINT_STAT(NumInstructions); 2407 DEBUG_PRINT_STAT(SROACostSavings); 2408 DEBUG_PRINT_STAT(SROACostSavingsLost); 2409 DEBUG_PRINT_STAT(LoadEliminationCost); 2410 DEBUG_PRINT_STAT(ContainsNoDuplicateCall); 2411 DEBUG_PRINT_STAT(Cost); 2412 DEBUG_PRINT_STAT(Threshold); 2413 #undef DEBUG_PRINT_STAT 2414 } 2415 2416 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 2417 /// Dump stats about this call's analysis. 2418 LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { print(); } 2419 #endif 2420 2421 /// Test that there are no attribute conflicts between Caller and Callee 2422 /// that prevent inlining. 2423 static bool functionsHaveCompatibleAttributes( 2424 Function *Caller, Function *Callee, TargetTransformInfo &TTI, 2425 function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) { 2426 // Note that CalleeTLI must be a copy, not a reference.
The legacy pass manager 2427 // caches the most recently created TLI in the TargetLibraryInfoWrapperPass 2428 // object, and always returns the same object (which is overwritten on each 2429 // GetTLI call). Therefore we copy the first result. 2430 auto CalleeTLI = GetTLI(*Callee); 2431 return TTI.areInlineCompatible(Caller, Callee) && 2432 GetTLI(*Caller).areInlineCompatible(CalleeTLI, 2433 InlineCallerSupersetNoBuiltin) && 2434 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 2435 } 2436 2437 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) { 2438 int Cost = 0; 2439 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { 2440 if (Call.isByValArgument(I)) { 2441 // We approximate the number of loads and stores needed by dividing the 2442 // size of the byval type by the target's pointer size. 2443 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType()); 2444 unsigned TypeSize = DL.getTypeSizeInBits(Call.getParamByValType(I)); 2445 unsigned AS = PTy->getAddressSpace(); 2446 unsigned PointerSize = DL.getPointerSizeInBits(AS); 2447 // Ceiling division. 2448 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; 2449 2450 // If it generates more than 8 stores it is likely to be expanded as an 2451 // inline memcpy so we take that as an upper bound. Otherwise we assume 2452 // one load and one store per word copied. 2453 // FIXME: The maxStoresPerMemcpy setting from the target should be used 2454 // here instead of a magic number of 8, but it's not available via 2455 // DataLayout. 2456 NumStores = std::min(NumStores, 8U); 2457 2458 Cost += 2 * NumStores * InlineConstants::InstrCost; 2459 } else { 2460 // For non-byval arguments subtract off one instruction per call 2461 // argument. 2462 Cost += InlineConstants::InstrCost; 2463 } 2464 } 2465 // The call instruction also disappears after inlining. 
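  // Worked example (hypothetical target with 64-bit pointers): a byval
  // argument of 256 bits needs NumStores = ceil(256 / 64) = 4 word copies,
  // contributing 2 * 4 * InstrCost (one load and one store per word). The
  // statement below then adds the cost of the call instruction itself plus
  // the call penalty, both of which disappear once the call is inlined.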
2466 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty; 2467 return Cost; 2468 } 2469 2470 InlineCost llvm::getInlineCost( 2471 CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI, 2472 function_ref<AssumptionCache &(Function &)> GetAssumptionCache, 2473 function_ref<const TargetLibraryInfo &(Function &)> GetTLI, 2474 function_ref<BlockFrequencyInfo &(Function &)> GetBFI, 2475 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 2476 return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI, 2477 GetAssumptionCache, GetTLI, GetBFI, PSI, ORE); 2478 } 2479 2480 Optional<int> llvm::getInliningCostEstimate( 2481 CallBase &Call, TargetTransformInfo &CalleeTTI, 2482 function_ref<AssumptionCache &(Function &)> GetAssumptionCache, 2483 function_ref<BlockFrequencyInfo &(Function &)> GetBFI, 2484 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 2485 const InlineParams Params = {/* DefaultThreshold*/ 0, 2486 /*HintThreshold*/ {}, 2487 /*ColdThreshold*/ {}, 2488 /*OptSizeThreshold*/ {}, 2489 /*OptMinSizeThreshold*/ {}, 2490 /*HotCallSiteThreshold*/ {}, 2491 /*LocallyHotCallSiteThreshold*/ {}, 2492 /*ColdCallSiteThreshold*/ {}, 2493 /*ComputeFullInlineCost*/ true, 2494 /*EnableDeferral*/ true}; 2495 2496 InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI, 2497 GetAssumptionCache, GetBFI, PSI, ORE, true, 2498 /*IgnoreThreshold*/ true); 2499 auto R = CA.analyze(); 2500 if (!R.isSuccess()) 2501 return None; 2502 return CA.getCost(); 2503 } 2504 2505 Optional<InlineResult> llvm::getAttributeBasedInliningDecision( 2506 CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI, 2507 function_ref<const TargetLibraryInfo &(Function &)> GetTLI) { 2508 2509 // Cannot inline indirect calls. 2510 if (!Callee) 2511 return InlineResult::failure("indirect call"); 2512 2513 // If a callee coroutine function is inlined into a caller coroutine function 2514 // before the coro-split pass, the coro-early pass cannot handle it well. 2515 // So we do not inline a coroutine function that has not yet been split. 2516 if (Callee->isPresplitCoroutine()) 2517 return InlineResult::failure("unsplit coroutine call"); 2518 2519 2520 // Never inline calls with byval arguments that do not have the alloca 2521 // address space. Since byval arguments can be replaced with a copy to an 2522 // alloca, the inlined code would need to be adjusted to handle that the 2523 // argument is in the alloca address space (so it is a little bit complicated 2524 // to solve). 2525 unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace(); 2526 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) 2527 if (Call.isByValArgument(I)) { 2528 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType()); 2529 if (PTy->getAddressSpace() != AllocaAS) 2530 return InlineResult::failure("byval arguments without alloca" 2531 " address space"); 2532 } 2533 2534 // Calls to functions with always-inline attributes should be inlined 2535 // whenever possible. 2536 if (Call.hasFnAttr(Attribute::AlwaysInline)) { 2537 auto IsViable = isInlineViable(*Callee); 2538 if (IsViable.isSuccess()) 2539 return InlineResult::success(); 2540 return InlineResult::failure(IsViable.getFailureReason()); 2541 } 2542 2543 // Never inline functions with conflicting attributes (unless callee has 2544 // always-inline attribute).
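  // For illustration (target-specific behavior, hypothetical attribute
  // strings): a callee carrying "target-features"="+avx2" will typically be
  // rejected by TTI.areInlineCompatible when the caller lacks that feature,
  // since the inlined body could execute instructions the caller never
  // guaranteed support for.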
2545 Function *Caller = Call.getCaller(); 2546 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI)) 2547 return InlineResult::failure("conflicting attributes"); 2548 2549 // Don't inline this call if the caller has the optnone attribute. 2550 if (Caller->hasOptNone()) 2551 return InlineResult::failure("optnone attribute"); 2552 2553 // Don't inline a function that treats null pointer as valid into a caller 2554 // that does not have this attribute. 2555 if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined()) 2556 return InlineResult::failure("nullptr definitions incompatible"); 2557 2558 // Don't inline functions which can be interposed at link-time. 2559 if (Callee->isInterposable()) 2560 return InlineResult::failure("interposable"); 2561 2562 // Don't inline functions marked noinline. 2563 if (Callee->hasFnAttribute(Attribute::NoInline)) 2564 return InlineResult::failure("noinline function attribute"); 2565 2566 // Don't inline call sites marked noinline. 2567 if (Call.isNoInline()) 2568 return InlineResult::failure("noinline call site attribute"); 2569 2570 // Don't inline functions if one does not have any stack protector attribute 2571 // but the other does. 2572 if (Caller->hasStackProtectorFnAttr() && !Callee->hasStackProtectorFnAttr()) 2573 return InlineResult::failure( 2574 "stack protected caller but callee requested no stack protector"); 2575 if (Callee->hasStackProtectorFnAttr() && !Caller->hasStackProtectorFnAttr()) 2576 return InlineResult::failure( 2577 "stack protected callee but caller requested no stack protector"); 2578 2579 return None; 2580 } 2581 2582 InlineCost llvm::getInlineCost( 2583 CallBase &Call, Function *Callee, const InlineParams &Params, 2584 TargetTransformInfo &CalleeTTI, 2585 function_ref<AssumptionCache &(Function &)> GetAssumptionCache, 2586 function_ref<const TargetLibraryInfo &(Function &)> GetTLI, 2587 function_ref<BlockFrequencyInfo &(Function &)> GetBFI, 2588 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 2589 2590 auto UserDecision = 2591 llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI); 2592 2593 if (UserDecision.hasValue()) { 2594 if (UserDecision->isSuccess()) 2595 return llvm::InlineCost::getAlways("always inline attribute"); 2596 return llvm::InlineCost::getNever(UserDecision->getFailureReason()); 2597 } 2598 2599 LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 2600 << "... (caller:" << Call.getCaller()->getName() 2601 << ")\n"); 2602 2603 InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI, 2604 GetAssumptionCache, GetBFI, PSI, ORE); 2605 InlineResult ShouldInline = CA.analyze(); 2606 2607 LLVM_DEBUG(CA.dump()); 2608 2609 // Always make cost benefit based decision explicit. 2610 // We use always/never here since threshold is not meaningful, 2611 // as it's not what drives cost-benefit analysis. 2612 if (CA.wasDecidedByCostBenefit()) { 2613 if (ShouldInline.isSuccess()) 2614 return InlineCost::getAlways("benefit over cost"); 2615 else 2616 return InlineCost::getNever("cost over benefit"); 2617 } 2618 2619 // Check if there was a reason to force inlining or no inlining. 
2620 if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold()) 2621 return InlineCost::getNever(ShouldInline.getFailureReason()); 2622 if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold()) 2623 return InlineCost::getAlways("empty function"); 2624 2625 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 2626 } 2627 2628 InlineResult llvm::isInlineViable(Function &F) { 2629 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 2630 for (BasicBlock &BB : F) { 2631 // Disallow inlining of functions which contain indirect branches. 2632 if (isa<IndirectBrInst>(BB.getTerminator())) 2633 return InlineResult::failure("contains indirect branches"); 2634 2635 // Disallow inlining of blockaddresses which are used by non-callbr 2636 // instructions. 2637 if (BB.hasAddressTaken()) 2638 for (User *U : BlockAddress::get(&BB)->users()) 2639 if (!isa<CallBrInst>(*U)) 2640 return InlineResult::failure("blockaddress used outside of callbr"); 2641 2642 for (auto &II : BB) { 2643 CallBase *Call = dyn_cast<CallBase>(&II); 2644 if (!Call) 2645 continue; 2646 2647 // Disallow recursive calls. 2648 Function *Callee = Call->getCalledFunction(); 2649 if (&F == Callee) 2650 return InlineResult::failure("recursive call"); 2651 2652 // Disallow calls which expose returns-twice to a function not previously 2653 // attributed as such. 2654 if (!ReturnsTwice && isa<CallInst>(Call) && 2655 cast<CallInst>(Call)->canReturnTwice()) 2656 return InlineResult::failure("exposes returns-twice attribute"); 2657 2658 if (Callee) 2659 switch (Callee->getIntrinsicID()) { 2660 default: 2661 break; 2662 case llvm::Intrinsic::icall_branch_funnel: 2663 // Disallow inlining of @llvm.icall.branch.funnel because current 2664 // backend can't separate call targets from call arguments. 2665 return InlineResult::failure( 2666 "disallowed inlining of @llvm.icall.branch.funnel"); 2667 case llvm::Intrinsic::localescape: 2668 // Disallow inlining functions that call @llvm.localescape. Doing this 2669 // correctly would require major changes to the inliner. 2670 return InlineResult::failure( 2671 "disallowed inlining of @llvm.localescape"); 2672 case llvm::Intrinsic::vastart: 2673 // Disallow inlining of functions that initialize VarArgs with 2674 // va_start. 2675 return InlineResult::failure( 2676 "contains VarArgs initialized with va_start"); 2677 } 2678 } 2679 } 2680 2681 return InlineResult::success(); 2682 } 2683 2684 // APIs to create InlineParams based on command line flags and/or other 2685 // parameters. 2686 2687 InlineParams llvm::getInlineParams(int Threshold) { 2688 InlineParams Params; 2689 2690 // This field is the threshold to use for a callee by default. This is 2691 // derived from one or more of: 2692 // * optimization or size-optimization levels, 2693 // * a value passed to createFunctionInliningPass function, or 2694 // * the -inline-threshold flag. 2695 // If the -inline-threshold flag is explicitly specified, that is used 2696 // irrespective of anything else. 2697 if (InlineThreshold.getNumOccurrences() > 0) 2698 Params.DefaultThreshold = InlineThreshold; 2699 else 2700 Params.DefaultThreshold = Threshold; 2701 2702 // Set the HintThreshold knob from the -inlinehint-threshold. 2703 Params.HintThreshold = HintThreshold; 2704 2705 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. 
2706 Params.HotCallSiteThreshold = HotCallSiteThreshold; 2707 2708 // If the -locally-hot-callsite-threshold is explicitly specified, use it to 2709 // populate LocallyHotCallSiteThreshold. Later, we populate 2710 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if 2711 // we know that optimization level is O3 (in the getInlineParams variant that 2712 // takes the opt and size levels). 2713 // FIXME: Remove this check (and make the assignment unconditional) after 2714 // addressing size regression issues at O2. 2715 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) 2716 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2717 2718 // Set the ColdCallSiteThreshold knob from the 2719 // -inline-cold-callsite-threshold. 2720 Params.ColdCallSiteThreshold = ColdCallSiteThreshold; 2721 2722 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the 2723 // -inline-threshold commandline option is not explicitly given. If that 2724 // option is present, then its value applies even for callees with size and 2725 // minsize attributes. 2726 // If the -inline-threshold is not specified, set the ColdThreshold from the 2727 // -inlinecold-threshold even if it is not explicitly passed. If 2728 // -inline-threshold is specified, then -inlinecold-threshold needs to be 2729 // explicitly specified to set the ColdThreshold knob. 2730 if (InlineThreshold.getNumOccurrences() == 0) { 2731 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; 2732 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; 2733 Params.ColdThreshold = ColdThreshold; 2734 } else if (ColdThreshold.getNumOccurrences() > 0) { 2735 Params.ColdThreshold = ColdThreshold; 2736 } 2737 return Params; 2738 } 2739 2740 InlineParams llvm::getInlineParams() { 2741 return getInlineParams(DefaultThreshold); 2742 } 2743 2744 // Compute the default threshold for inlining based on the opt level and the 2745 // size opt level. 2746 static int computeThresholdFromOptLevels(unsigned OptLevel, 2747 unsigned SizeOptLevel) { 2748 if (OptLevel > 2) 2749 return InlineConstants::OptAggressiveThreshold; 2750 if (SizeOptLevel == 1) // -Os 2751 return InlineConstants::OptSizeThreshold; 2752 if (SizeOptLevel == 2) // -Oz 2753 return InlineConstants::OptMinSizeThreshold; 2754 return DefaultThreshold; 2755 } 2756 2757 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { 2758 auto Params = 2759 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); 2760 // At O3, use the value of the -locally-hot-callsite-threshold option to 2761 // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has 2762 // effect only when it is specified explicitly. 2763 if (OptLevel > 2) 2764 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2765 return Params; 2766 } 2767 2768 PreservedAnalyses 2769 InlineCostAnnotationPrinterPass::run(Function &F, 2770 FunctionAnalysisManager &FAM) { 2771 PrintInstructionComments = true; 2772 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 2773 [&](Function &F) -> AssumptionCache & { 2774 return FAM.getResult<AssumptionAnalysis>(F); 2775 }; 2776 Module *M = F.getParent(); 2777 ProfileSummaryInfo PSI(*M); 2778 DataLayout DL(M); 2779 TargetTransformInfo TTI(DL); 2780 // FIXME: Redesign the usage of InlineParams to expand the scope of this pass. 2781 // In the current implementation, the type of InlineParams doesn't matter as 2782 // the pass serves only for verification of inliner's decisions.
2783 // We can add a flag which determines InlineParams for this run. Right now, 2784 // the default InlineParams are used. 2785 const InlineParams Params = llvm::getInlineParams(); 2786 for (BasicBlock &BB : F) { 2787 for (Instruction &I : BB) { 2788 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 2789 Function *CalledFunction = CI->getCalledFunction(); 2790 if (!CalledFunction || CalledFunction->isDeclaration()) 2791 continue; 2792 OptimizationRemarkEmitter ORE(CalledFunction); 2793 InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI, 2794 GetAssumptionCache, nullptr, &PSI, &ORE); 2795 ICCA.analyze(); 2796 OS << " Analyzing call of " << CalledFunction->getName() 2797 << "... (caller:" << CI->getCaller()->getName() << ")\n"; 2798 ICCA.print(); 2799 } 2800 } 2801 } 2802 return PreservedAnalyses::all(); 2803 } 2804
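
// A minimal usage sketch for clients of this analysis (hypothetical
// surrounding pass; 'FAM', 'CB', 'CalleeTTI', 'PSI', and 'ORE' are assumed
// to be available from the caller's context):
//
//   const InlineParams Params =
//       llvm::getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0);
//   auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
//     return FAM.getResult<AssumptionAnalysis>(F);
//   };
//   auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
//     return FAM.getResult<TargetLibraryAnalysis>(F);
//   };
//   InlineCost IC =
//       llvm::getInlineCost(CB, Params, CalleeTTI, GetAssumptionCache,
//                           GetTLI, /*GetBFI=*/nullptr, PSI, &ORE);
//   if (IC)
//     ; // Cost is under threshold; an inliner would proceed to inline CB.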