//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Command-line knobs for the inliner's cost model. All thresholds below are
// expressed in the same abstract cost units accumulated by the analysis in
// this file.

static cl::opt<int>
    DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
                     cl::ZeroOrMore,
                     cl::desc("Default amount of inlining to perform"));

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost 
of a call site even when the cost "
             "exceeds the threshold."));

namespace {
class InlineCostCallAnalyzer;

/// Visitor that walks a candidate callee's body in the context of a specific
/// call site, simplifying instructions against the call's arguments and
/// firing the extension points below. Subclasses (e.g.
/// InlineCostCallAnalyzer) map those events onto a concrete cost model.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

protected:
  virtual ~CallAnalyzer() {}
  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Extension points for handling callsite features.
  /// Called after a basic block was analyzed.
  virtual void onBlockAnalyzed(const BasicBlock *BB) {}

  /// Called at the end of the analysis of the callsite. Return the outcome of
  /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
  /// the reason it can't.
  virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }

  /// Called when we're about to start processing a basic block, and every time
  /// we are done processing an instruction. Return true if there is no point in
  /// continuing the analysis (e.g. we've determined already the call site is
  /// too expensive to inline)
  virtual bool shouldStop() { return false; }

  /// Called before the analysis of the callee body starts (with callsite
  /// contexts propagated). It checks callsite-specific information. Return a
  /// reason analysis can't continue if that's the case, or 'true' if it may
  /// continue.
  virtual InlineResult onAnalysisStart() { return InlineResult::success(); }

  /// Called if the analysis engine decides SROA cannot be done for the given
  /// alloca.
  virtual void onDisableSROA(AllocaInst *Arg) {}

  /// Called the analysis engine determines load elimination won't happen.
  virtual void onDisableLoadElimination() {}

  /// Called to account for a call.
  virtual void onCallPenalty() {}

  /// Called to account for the expectation the inlining would result in a load
  /// elimination.
  virtual void onLoadEliminationOpportunity() {}

  /// Called to account for the cost of argument setup for the Call in the
  /// callee's body (not the callsite currently under analysis).
  virtual void onCallArgumentSetup(const CallBase &Call) {}

  /// Called to account for a load relative intrinsic.
  virtual void onLoadRelativeIntrinsic() {}

  /// Called to account for a lowered call.
  virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) {
  }

  /// Account for a jump table of given size. Return false to stop further
  /// processing the switch instruction
  virtual bool onJumpTable(unsigned JumpTableSize) { return true; }

  /// Account for a case cluster of given size. Return false to stop further
  /// processing of the instruction.
  virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; }

  /// Called at the end of processing a switch instruction, with the given
  /// number of case clusters.
  virtual void onFinalizeSwitch(unsigned JumpTableSize,
                                unsigned NumCaseCluster) {}

  /// Called to account for any other instruction not specifically accounted
  /// for.
  virtual void onMissedSimplification() {}

  /// Start accounting potential benefits due to SROA for the given alloca.
  virtual void onInitializeSROAArg(AllocaInst *Arg) {}

  /// Account SROA savings for the AllocaInst value.
  virtual void onAggregateSROAUse(AllocaInst *V) {}

  /// Account for a use of an SROA-candidate value \p V. When \p DoNotDisable
  /// is true the use is credited as an SROA saving and true is returned;
  /// otherwise SROA is disabled for the underlying alloca. Returns false when
  /// \p V does not map to an enabled SROA candidate.
  bool handleSROA(Value *V, bool DoNotDisable) {
    // Check for SROA candidates in comparisons.
    if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
      if (DoNotDisable) {
        onAggregateSROAUse(SROAArg);
        return true;
      }
      disableSROAForArg(SROAArg);
    }
    return false;
  }

  // Facts observed about the callee body while walking it.
  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, AllocaInst *> SROAArgValues;

  /// Keep track of Allocas for which we believe we may get SROA optimization.
  DenseSet<AllocaInst *> EnabledSROAAllocas;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to be
  /// loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;

  /// Return the SROA-candidate alloca that \p V maps to, or null if there is
  /// none or the alloca's SROA candidacy has since been disabled.
  AllocaInst *getSROAArgForValueOrNull(Value *V) const {
    auto It = SROAArgValues.find(V);
    if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
      return nullptr;
    return It->second;
  }

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  void disableSROAForArg(AllocaInst *SROAArg);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallBase &Call)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), EnableLoadElimination(true) {}

  /// Run the analysis over the callee body for this callsite.
  InlineResult analyze();

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;

  void dump();
};

/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note
/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer
class InlineCostCallAnalyzer final : public CallAnalyzer {
  const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
  const bool ComputeFullInlineCost;
  int LoadEliminationCost = 0;
  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold = 0;

  /// Attempt to evaluate indirect calls to boost its inline cost.
  const bool BoostIndirectCalls;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  bool SingleBB = true;

  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<AllocaInst *, int> SROAArgCosts;

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);
  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  void onDisableSROA(AllocaInst *Arg) override {
    auto CostIt = SROAArgCosts.find(Arg);
    if (CostIt == SROAArgCosts.end())
      return;
    // Give back the savings we had credited for this alloca and remember how
    // much was lost, for the debug statistics.
    addCost(CostIt->second);
    SROACostSavings -= CostIt->second;
    SROACostSavingsLost += CostIt->second;
    SROAArgCosts.erase(CostIt);
  }

  void onDisableLoadElimination() override {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
  }
  void onCallPenalty() override { addCost(InlineConstants::CallPenalty); }
  void onCallArgumentSetup(const CallBase &Call) override {
    // Pay the price of the argument setup. We account for the average 1
    // instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);
  }
  void onLoadRelativeIntrinsic() override {
    // This is normally lowered to 4 LLVM instructions.
    // NOTE(review): only 3 * InstrCost is added here — presumably the visited
    // call itself accounts for the fourth unit; confirm against visitCallBase.
    addCost(3 * InlineConstants::InstrCost);
  }
  void onLoweredCall(Function *F, CallBase &Call,
                     bool IsIndirectCall) override {
    // We account for the average 1 instruction per call argument setup here.
    addCost(Call.arg_size() * InlineConstants::InstrCost);

    // If we have a constant that we are calling as a function, we can peer
    // through it and see the function target. This happens not infrequently
    // during devirtualization and so we want to give it a hefty bonus for
    // inlining, but cap that bonus in the event that inlining wouldn't pan out.
    // Pretend to inline the function, with a custom threshold.
    if (IsIndirectCall && BoostIndirectCalls) {
      auto IndirectCallParams = Params;
      IndirectCallParams.DefaultThreshold =
          InlineConstants::IndirectCallThreshold;
      /// FIXME: if InlineCostCallAnalyzer is derived from, this may need
      /// to instantiate the derived class.
      InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
                                Call, IndirectCallParams, false);
      if (CA.analyze().isSuccess()) {
        // We were able to inline the indirect call! Subtract the cost from the
        // threshold to get the bonus we want to apply, but don't go below zero.
        Cost -= std::max(0, CA.getThreshold() - CA.getCost());
      }
    } else
      // Otherwise simply add the cost for merely making the call.
      addCost(InlineConstants::CallPenalty);
  }

  void onFinalizeSwitch(unsigned JumpTableSize,
                        unsigned NumCaseCluster) override {
    // If suitable for a jump table, consider the cost for the table size and
    // branch to destination.
    // Maximum valid cost increased in this function.
    if (JumpTableSize) {
      int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                       4 * InlineConstants::InstrCost;

      addCost(JTCost, (int64_t)CostUpperBound);
      return;
    }
    // Considering forming a binary search, we should find the number of nodes
    // which is same as the number of comparisons when lowered. For a given
    // number of clusters, n, we can define a recursive function, f(n), to find
    // the number of nodes in the tree. The recursion is :
    // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
    // and f(n) = n, when n <= 3.
    // This will lead a binary tree where the leaf should be either f(2) or f(3)
    // when n > 3. So, the number of comparisons from leaves should be n, while
    // the number of non-leaf should be :
    // 2^(log2(n) - 1) - 1
    // = 2^log2(n) * 2^-1 - 1
    // = n / 2 - 1.
    // Considering comparisons from leaf and non-leaf nodes, we can estimate the
    // number of comparisons in a simple closed form :
    // n + n / 2 - 1 = n * 3 / 2 - 1
    if (NumCaseCluster <= 3) {
      // Suppose a comparison includes one compare and one conditional branch.
      addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
      return;
    }

    int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
    int64_t SwitchCost =
        ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

    addCost(SwitchCost, (int64_t)CostUpperBound);
  }
  void onMissedSimplification() override {
    addCost(InlineConstants::InstrCost);
  }

  void onInitializeSROAArg(AllocaInst *Arg) override {
    assert(Arg != nullptr &&
           "Should not initialize SROA costs for null value.");
    SROAArgCosts[Arg] = 0;
  }

  void onAggregateSROAUse(AllocaInst *SROAArg) override {
    auto CostIt = SROAArgCosts.find(SROAArg);
    assert(CostIt != SROAArgCosts.end() &&
           "expected this argument to have a cost");
    CostIt->second += InlineConstants::InstrCost;
    SROACostSavings += InlineConstants::InstrCost;
  }

  void onBlockAnalyzed(const BasicBlock *BB) override {
    auto *TI = BB->getTerminator();
    // If we had any successors at this point, than post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  InlineResult finalizeAnalysis() override {
    // Loops generally act a lot like calls in that they act like barriers to
    // movement, require a certain amount of setup, etc. So when optimising for
    // size, we penalise any call sites that perform loops. We do this after all
    // other costs here, so will likely only be dealing with relatively small
    // functions (and hence DT and LI will hopefully be cheap).
    auto *Caller = CandidateCall.getFunction();
    if (Caller->hasMinSize()) {
      DominatorTree DT(F);
      LoopInfo LI(DT);
      int NumLoops = 0;
      for (Loop *L : LI) {
        // Ignore loops that will not be executed
        if (DeadBlocks.count(L->getHeader()))
          continue;
        NumLoops++;
      }
      addCost(NumLoops * InlineConstants::CallPenalty);
    }

    // We applied the maximum possible vector bonus at the beginning. Now,
    // subtract the excess bonus, if any, from the Threshold before
    // comparing against Cost.
    if (NumVectorInstructions <= NumInstructions / 10)
      Threshold -= VectorBonus;
    else if (NumVectorInstructions <= NumInstructions / 2)
      Threshold -= VectorBonus / 2;

    if (Cost < std::max(1, Threshold))
      return InlineResult::success();
    return InlineResult::failure("Cost over threshold.");
  }
  bool shouldStop() override {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    return Cost >= Threshold && !ComputeFullInlineCost;
  }

  void onLoadEliminationOpportunity() override {
    LoadEliminationCost += InlineConstants::InstrCost;
  }

  InlineResult onAnalysisStart() override {
    // Perform some tweaks to the cost and threshold based on the direct
    // callsite information.

    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low. Note that these bonuses are some what arbitrary and evolved over
    // time by accident as much as because they are principled bonuses.
    //
    // FIXME: It would be nice to remove all such bonuses. At least it would be
    // nice to base the bonus values on something more scientific.
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);

    // Update the threshold based on callsite properties
    updateThreshold(CandidateCall, F);

    // While Threshold depends on commandline options that can take negative
    // values, we want to enforce the invariant that the computed threshold and
    // bonuses are non-negative.
    assert(Threshold >= 0);
    assert(SingleBBBonus >= 0);
    assert(VectorBonus >= 0);

    // Speculatively apply all possible bonuses to Threshold. If cost exceeds
    // this Threshold any time, and cost cannot decrease, we can stop processing
    // the rest of the function body.
    Threshold += (SingleBBBonus + VectorBonus);

    // Give out bonuses for the callsite, as the instructions setting them up
    // will be gone after inlining.
    addCost(-getCallsiteCost(this->CandidateCall, DL));

    // If this function uses the coldcc calling convention, prefer not to inline
    // it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return InlineResult::failure("high cost");

    return InlineResult::success();
  }

public:
  InlineCostCallAnalyzer(
      const TargetTransformInfo &TTI,
      std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
      ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
      CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
      : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE),
        Params(Params), Threshold(Params.DefaultThreshold),
        BoostIndirectCalls(BoostIndirect) {}
  void dump();

  virtual ~InlineCostCallAnalyzer() {}
  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }
};
} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Permanently rule out SROA for \p SROAArg: notify the cost model, drop the
/// alloca from the enabled set, and give up on load elimination.
void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
  onDisableSROA(SROAArg);
  EnabledSROAAllocas.erase(SROAArg);
  disableLoadElimination();
}
/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
    disableSROAForArg(SROAArg);
  }
}

/// Notify the cost model exactly once that load elimination is off.
void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    onDisableLoadElimination();
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    // An index that is not literally constant may still have been simplified
    // to a constant earlier in this callsite's analysis.
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      // Saturating arithmetic keeps AllocatedSize from wrapping for very
      // large allocation sizes.
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
          AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize =
        SaturatingAdd(DL.getTypeAllocSize(Ty).getFixedSize(), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate it's bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
825 FirstV = V; 826 FirstBaseAndOffset = BaseAndOffset; 827 } 828 829 // Check if we can map phi to a constant. 830 if (FirstC) { 831 SimplifiedValues[&I] = FirstC; 832 return true; 833 } 834 835 // Check if we can map phi to a pointer with constant offset. 836 if (FirstBaseAndOffset.first) { 837 ConstantOffsetPtrs[&I] = FirstBaseAndOffset; 838 839 if (auto *SROAArg = getSROAArgForValueOrNull(FirstV)) 840 SROAArgValues[&I] = SROAArg; 841 } 842 843 return true; 844 } 845 846 /// Check we can fold GEPs of constant-offset call site argument pointers. 847 /// This requires target data and inbounds GEPs. 848 /// 849 /// \return true if the specified GEP can be folded. 850 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { 851 // Check if we have a base + offset for the pointer. 852 std::pair<Value *, APInt> BaseAndOffset = 853 ConstantOffsetPtrs.lookup(I.getPointerOperand()); 854 if (!BaseAndOffset.first) 855 return false; 856 857 // Check if the offset of this GEP is constant, and if so accumulate it 858 // into Offset. 859 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) 860 return false; 861 862 // Add the result as a new mapping to Base + Offset. 863 ConstantOffsetPtrs[&I] = BaseAndOffset; 864 865 return true; 866 } 867 868 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { 869 auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand()); 870 871 // Lambda to check whether a GEP's indices are all constant. 872 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { 873 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) 874 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I)) 875 return false; 876 return true; 877 }; 878 879 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { 880 if (SROAArg) 881 SROAArgValues[&I] = SROAArg; 882 883 // Constant GEPs are modeled as free. 884 return true; 885 } 886 887 // Variable GEPs will require math and will disable SROA. 
  if (SROAArg)
    disableSROAForArg(SROAArg);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  // Gather constant operands, consulting previously simplified values.
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

/// Model a bitcast: always free, and both the base+offset tracking and SROA
/// viability pass straight through it.
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

/// Model a ptrtoint cast. Free when the target says so; base+offset info is
/// kept when the integer is wide enough to hold the pointer losslessly.
bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

/// Model an inttoptr cast, the inverse of the above: base+offset info
/// survives a round-trip through an integer that did not truncate it.
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  if (auto *SROAArg = getSROAArgForValueOrNull(Op))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

/// Model any other cast. Unlisted casts disable SROA on their operand, and
/// expensive floating-point casts are charged like a call.
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      onCallPenalty();
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

/// Model a generic unary instruction: constant-fold if possible, otherwise
/// disable SROA on the operand and charge the base cost.
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary instructions.
  disableSROA(Operand);

  return false;
}

/// Query whether the *call site* marks argument \p A with \p Attr.
bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

/// Return true if \p V is known non-null within the callee being analyzed.
bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

/// Return true if inlining this call site may grow the caller; false when the
/// site is on an unreachable-terminated path, where growth is pointless.
bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in below scenario inlining hot_call_X() may be
  // beneficial :
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

/// Return true if \p Call is considered cold, via the profile summary when
/// available, else by comparing its block frequency to the caller's entry.
bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
                                            BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CallSite(&Call), CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

/// Return the raised threshold to use for a hot \p Call, or None if the site
/// is not known hot (globally via PSI, or locally relative to caller entry).
Optional<int>
InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                                BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() &&
      PSI->isHotCallSite(CallSite(&Call), CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

/// Compute the final inlining threshold for \p Call into \p Callee from the
/// size-oriented attributes, hotness/coldness information, and bonuses.
void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = TTI.getInlinerVectorBonusPercent();
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

/// Model a compare. Folds compares of constants, of pointers that share a
/// tracked base, and of known-non-null values against null.
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  // The pointer-offset folding below only applies to integer compares.
  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  return handleSROA(I.getOperand(0), isa<ConstantPointerNull>(I.getOperand(1)));
}

/// Model a subtract, with a special case for the difference of two pointers
/// that share a tracked constant-offset base.
bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

/// Model a generic binary operator: try InstSimplify with (possibly
/// simplified) operands; on failure disable SROA on both operands and charge
/// expensive floating-point ops like a call.
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  // FP ops must be simplified under their fast-math flags.
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS,
                            FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  // Simplified to some other (free) value.
  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    onCallPenalty();

  return false;
}

/// Model fneg: simplify if possible, otherwise disable SROA on the operand.
bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = SimplifyFNegInst(
      COp ? COp : Op, cast<FPMathOperator>(I).getFastMathFlags(), DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}

/// Model a load: free if SROA can elide it or it is a likely-redundant
/// repeat load from an address already seen.
bool CallAnalyzer::visitLoad(LoadInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    onLoadEliminationOpportunity();
    return true;
  }

  return false;
}

/// Model a store: free if SROA can elide it; otherwise it clobbers the
/// load-elimination bookkeeping.
bool CallAnalyzer::visitStore(StoreInst &I) {
  if (handleSROA(I.getPointerOperand(), I.isSimple()))
    return true;

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
1434 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 1435 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0], 1436 /*InsertedValueOperand*/ COps[1], 1437 I.getIndices()); 1438 })) 1439 return true; 1440 1441 // SROA can look through these but give them a cost. 1442 return false; 1443 } 1444 1445 /// Try to simplify a call site. 1446 /// 1447 /// Takes a concrete function and callsite and tries to actually simplify it by 1448 /// analyzing the arguments and call itself with instsimplify. Returns true if 1449 /// it has simplified the callsite to some other entity (a constant), making it 1450 /// free. 1451 bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) { 1452 // FIXME: Using the instsimplify logic directly for this is inefficient 1453 // because we have to continually rebuild the argument list even when no 1454 // simplifications can be performed. Until that is fixed with remapping 1455 // inside of instsimplify, directly constant fold calls here. 1456 if (!canConstantFoldCallTo(&Call, F)) 1457 return false; 1458 1459 // Try to re-map the arguments to constants. 1460 SmallVector<Constant *, 4> ConstantArgs; 1461 ConstantArgs.reserve(Call.arg_size()); 1462 for (Value *I : Call.args()) { 1463 Constant *C = dyn_cast<Constant>(I); 1464 if (!C) 1465 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I)); 1466 if (!C) 1467 return false; // This argument doesn't map to a constant. 1468 1469 ConstantArgs.push_back(C); 1470 } 1471 if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) { 1472 SimplifiedValues[&Call] = C; 1473 return true; 1474 } 1475 1476 return false; 1477 } 1478 1479 bool CallAnalyzer::visitCallBase(CallBase &Call) { 1480 if (Call.hasFnAttr(Attribute::ReturnsTwice) && 1481 !F.hasFnAttribute(Attribute::ReturnsTwice)) { 1482 // This aborts the entire analysis. 
1483 ExposesReturnsTwice = true; 1484 return false; 1485 } 1486 if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate()) 1487 ContainsNoDuplicateCall = true; 1488 1489 Value *Callee = Call.getCalledOperand(); 1490 Function *F = dyn_cast_or_null<Function>(Callee); 1491 bool IsIndirectCall = !F; 1492 if (IsIndirectCall) { 1493 // Check if this happens to be an indirect function call to a known function 1494 // in this inline context. If not, we've done all we can. 1495 F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee)); 1496 if (!F) { 1497 onCallArgumentSetup(Call); 1498 1499 if (!Call.onlyReadsMemory()) 1500 disableLoadElimination(); 1501 return Base::visitCallBase(Call); 1502 } 1503 } 1504 1505 assert(F && "Expected a call to a known function"); 1506 1507 // When we have a concrete function, first try to simplify it directly. 1508 if (simplifyCallSite(F, Call)) 1509 return true; 1510 1511 // Next check if it is an intrinsic we know about. 1512 // FIXME: Lift this into part of the InstVisitor. 1513 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) { 1514 switch (II->getIntrinsicID()) { 1515 default: 1516 if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II)) 1517 disableLoadElimination(); 1518 return Base::visitCallBase(Call); 1519 1520 case Intrinsic::load_relative: 1521 onLoadRelativeIntrinsic(); 1522 return false; 1523 1524 case Intrinsic::memset: 1525 case Intrinsic::memcpy: 1526 case Intrinsic::memmove: 1527 disableLoadElimination(); 1528 // SROA can usually chew through these intrinsics, but they aren't free. 1529 return false; 1530 case Intrinsic::icall_branch_funnel: 1531 case Intrinsic::localescape: 1532 HasUninlineableIntrinsic = true; 1533 return false; 1534 case Intrinsic::vastart: 1535 InitsVargArgs = true; 1536 return false; 1537 } 1538 } 1539 1540 if (F == Call.getFunction()) { 1541 // This flag will fully abort the analysis, so don't bother with anything 1542 // else. 
1543 IsRecursiveCall = true; 1544 return false; 1545 } 1546 1547 if (TTI.isLoweredToCall(F)) { 1548 onLoweredCall(F, Call, IsIndirectCall); 1549 } 1550 1551 if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) 1552 disableLoadElimination(); 1553 return Base::visitCallBase(Call); 1554 } 1555 1556 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { 1557 // At least one return instruction will be free after inlining. 1558 bool Free = !HasReturn; 1559 HasReturn = true; 1560 return Free; 1561 } 1562 1563 bool CallAnalyzer::visitBranchInst(BranchInst &BI) { 1564 // We model unconditional branches as essentially free -- they really 1565 // shouldn't exist at all, but handling them makes the behavior of the 1566 // inliner more regular and predictable. Interestingly, conditional branches 1567 // which will fold away are also free. 1568 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) || 1569 dyn_cast_or_null<ConstantInt>( 1570 SimplifiedValues.lookup(BI.getCondition())); 1571 } 1572 1573 bool CallAnalyzer::visitSelectInst(SelectInst &SI) { 1574 bool CheckSROA = SI.getType()->isPointerTy(); 1575 Value *TrueVal = SI.getTrueValue(); 1576 Value *FalseVal = SI.getFalseValue(); 1577 1578 Constant *TrueC = dyn_cast<Constant>(TrueVal); 1579 if (!TrueC) 1580 TrueC = SimplifiedValues.lookup(TrueVal); 1581 Constant *FalseC = dyn_cast<Constant>(FalseVal); 1582 if (!FalseC) 1583 FalseC = SimplifiedValues.lookup(FalseVal); 1584 Constant *CondC = 1585 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition())); 1586 1587 if (!CondC) { 1588 // Select C, X, X => X 1589 if (TrueC == FalseC && TrueC) { 1590 SimplifiedValues[&SI] = TrueC; 1591 return true; 1592 } 1593 1594 if (!CheckSROA) 1595 return Base::visitSelectInst(SI); 1596 1597 std::pair<Value *, APInt> TrueBaseAndOffset = 1598 ConstantOffsetPtrs.lookup(TrueVal); 1599 std::pair<Value *, APInt> FalseBaseAndOffset = 1600 ConstantOffsetPtrs.lookup(FalseVal); 1601 if (TrueBaseAndOffset 
== FalseBaseAndOffset && TrueBaseAndOffset.first) { 1602 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; 1603 1604 if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal)) 1605 SROAArgValues[&SI] = SROAArg; 1606 return true; 1607 } 1608 1609 return Base::visitSelectInst(SI); 1610 } 1611 1612 // Select condition is a constant. 1613 Value *SelectedV = CondC->isAllOnesValue() 1614 ? TrueVal 1615 : (CondC->isNullValue()) ? FalseVal : nullptr; 1616 if (!SelectedV) { 1617 // Condition is a vector constant that is not all 1s or all 0s. If all 1618 // operands are constants, ConstantExpr::getSelect() can handle the cases 1619 // such as select vectors. 1620 if (TrueC && FalseC) { 1621 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) { 1622 SimplifiedValues[&SI] = C; 1623 return true; 1624 } 1625 } 1626 return Base::visitSelectInst(SI); 1627 } 1628 1629 // Condition is either all 1s or all 0s. SI can be simplified. 1630 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) { 1631 SimplifiedValues[&SI] = SelectedC; 1632 return true; 1633 } 1634 1635 if (!CheckSROA) 1636 return true; 1637 1638 std::pair<Value *, APInt> BaseAndOffset = 1639 ConstantOffsetPtrs.lookup(SelectedV); 1640 if (BaseAndOffset.first) { 1641 ConstantOffsetPtrs[&SI] = BaseAndOffset; 1642 1643 if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV)) 1644 SROAArgValues[&SI] = SROAArg; 1645 } 1646 1647 return true; 1648 } 1649 1650 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { 1651 // We model unconditional switches as free, see the comments on handling 1652 // branches. 1653 if (isa<ConstantInt>(SI.getCondition())) 1654 return true; 1655 if (Value *V = SimplifiedValues.lookup(SI.getCondition())) 1656 if (isa<ConstantInt>(V)) 1657 return true; 1658 1659 // Assume the most general case where the switch is lowered into 1660 // either a jump table, bit test, or a balanced binary tree consisting of 1661 // case clusters without merging adjacent clusters with the same 1662 // destination. 
We do not consider the switches that are lowered with a mix 1663 // of jump table/bit test/binary search tree. The cost of the switch is 1664 // proportional to the size of the tree or the size of jump table range. 1665 // 1666 // NB: We convert large switches which are just used to initialize large phi 1667 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent 1668 // inlining those. It will prevent inlining in cases where the optimization 1669 // does not (yet) fire. 1670 1671 unsigned JumpTableSize = 0; 1672 BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; 1673 unsigned NumCaseCluster = 1674 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); 1675 1676 onFinalizeSwitch(JumpTableSize, NumCaseCluster); 1677 return false; 1678 } 1679 1680 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { 1681 // We never want to inline functions that contain an indirectbr. This is 1682 // incorrect because all the blockaddress's (in static global initializers 1683 // for example) would be referring to the original function, and this 1684 // indirect jump would jump from the inlined copy of the function into the 1685 // original function which is extremely undefined behavior. 1686 // FIXME: This logic isn't really right; we can safely inline functions with 1687 // indirectbr's as long as no other function or global references the 1688 // blockaddress of a block within the current function. 1689 HasIndirectBr = true; 1690 return false; 1691 } 1692 1693 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { 1694 // FIXME: It's not clear that a single instruction is an accurate model for 1695 // the inline cost of a resume instruction. 1696 return false; 1697 } 1698 1699 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { 1700 // FIXME: It's not clear that a single instruction is an accurate model for 1701 // the inline cost of a cleanupret instruction. 
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions
  // leading to unreachable as they have the lowest possible impact on both
  // runtime and code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      onMissedSimplification();

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    // These flags are set as side effects of the per-instruction visitors
    // above; only the first failure reason found is reported.
    InlineResult IR = InlineResult::success();
    if (IsRecursiveCall)
      IR = InlineResult::failure("recursive");
    else if (ExposesReturnsTwice)
      IR = InlineResult::failure("exposes returns twice");
    else if (HasDynamicAlloca)
      IR = InlineResult::failure("dynamic alloca");
    else if (HasIndirectBr)
      IR = InlineResult::failure("indirect branch");
    else if (HasUninlineableIntrinsic)
      IR = InlineResult::failure("uninlinable intrinsic");
    else if (InitsVargArgs)
      IR = InlineResult::failure("varargs");
    if (!IR.isSuccess()) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.getFailureReason())
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      auto IR =
          InlineResult::failure("recursive and allocates too much stack space");
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is "
                 << NV("InlineResult", IR.getFailureReason())
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    if (shouldStop())
      return InlineResult::failure(
          "Call site analysis is not favorable to inlining.");
  }

  return InlineResult::success();
}

/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant.
It 1828 /// returns 0 if V is not a pointer, and returns the constant '0' if there are 1829 /// no constant offsets applied. 1830 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { 1831 if (!V->getType()->isPointerTy()) 1832 return nullptr; 1833 1834 unsigned AS = V->getType()->getPointerAddressSpace(); 1835 unsigned IntPtrWidth = DL.getIndexSizeInBits(AS); 1836 APInt Offset = APInt::getNullValue(IntPtrWidth); 1837 1838 // Even though we don't look through PHI nodes, we could be called on an 1839 // instruction in an unreachable block, which may be on a cycle. 1840 SmallPtrSet<Value *, 4> Visited; 1841 Visited.insert(V); 1842 do { 1843 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { 1844 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) 1845 return nullptr; 1846 V = GEP->getPointerOperand(); 1847 } else if (Operator::getOpcode(V) == Instruction::BitCast) { 1848 V = cast<Operator>(V)->getOperand(0); 1849 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { 1850 if (GA->isInterposable()) 1851 break; 1852 V = GA->getAliasee(); 1853 } else { 1854 break; 1855 } 1856 assert(V->getType()->isPointerTy() && "Unexpected operand type!"); 1857 } while (Visited.insert(V).second); 1858 1859 Type *IdxPtrTy = DL.getIndexType(V->getType()); 1860 return cast<ConstantInt>(ConstantInt::get(IdxPtrTy, Offset)); 1861 } 1862 1863 /// Find dead blocks due to deleted CFG edges during inlining. 1864 /// 1865 /// If we know the successor of the current block, \p CurrBB, has to be \p 1866 /// NextBB, the other successors of \p CurrBB are dead if these successors have 1867 /// no live incoming CFG edges. If one block is found to be dead, we can 1868 /// continue growing the dead block list by checking the successors of the dead 1869 /// blocks to see if all their incoming edges are dead or not. 
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    // Grow the set of dead blocks transitively from this newly-dead
    // successor, using an explicit worklist.
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
InlineResult CallAnalyzer::analyze() {
  ++NumCallsAnalyzed;

  auto Result = onAnalysisStart();
  if (!Result.isSuccess())
    return Result;

  if (F.empty())
    return InlineResult::success();

  Function *Caller = CandidateCall.getFunction();
  // Check if the caller function is recursive itself: scan the callee's users
  // for a call site that lives inside the caller.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = CandidateCall.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CandidateCall.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = SROAArg;
        onInitializeSROAArg(SROAArg);
        EnabledSROAAllocas.insert(SROAArg);
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());

  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    if (shouldStop())
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR.isSuccess())
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          // The branch folds to a single successor; record it so dead-block
          // discovery can prune the untaken side.
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // NOTE(review): the `continue`s above skip this hook for blocks whose
    // terminator was simplified — presumably intentional; confirm against
    // onBlockAnalyzed's contract.
    onBlockAnalyzed(BB);
  }

  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                    &F == CandidateCall.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return InlineResult::failure("noduplicate");

  return finalizeAnalysis();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

// Convenience overload: analyze the call against its statically-called
// function (the common, non-devirtualized case).
InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever("indirect call");

  // Never inline calls with byval arguments that does not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever("byval arguments without alloca"
                                          " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    auto IsViable = isInlineViable(*Callee);
    if (IsViable.isSuccess())
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(IsViable.getFailureReason());
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return llvm::InlineCost::getNever("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return llvm::InlineCost::getNever("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return llvm::InlineCost::getNever("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return llvm::InlineCost::getNever("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return llvm::InlineCost::getNever("noinline call site attribute");

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Caller->getName() << ")\n");

  InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
                            *Callee, Call, Params);
  InlineResult ShouldInline = CA.analyze();

  LLVM_DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever(ShouldInline.getFailureReason());
  if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways("empty function");

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BI->getTerminator()))
      return InlineResult::failure("contains indirect branches");

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BI->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BI)->users())
        if (!isa<CallBrInst>(*U))
          return InlineResult::failure("blockaddress used outside of callbr");

    for (auto &II : *BI) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      if (&F == Call->getCalledFunction())
        return InlineResult::failure("recursive call");

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return InlineResult::failure("exposes returns-twice attribute");

      // Reject functions calling intrinsics the inliner cannot handle.
      if (Call->getCalledFunction())
        switch (Call->getCalledFunction()->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::icall_branch_funnel:
          // Disallow inlining of @llvm.icall.branch.funnel because current
          // backend can't separate call targets from call arguments.
          return InlineResult::failure(
              "disallowed inlining of @llvm.icall.branch.funnel");
        case llvm::Intrinsic::localescape:
          // Disallow inlining functions that call @llvm.localescape. Doing this
          // correctly would require major changes to the inliner.
          return InlineResult::failure(
              "disallowed inlining of @llvm.localescape");
        case llvm::Intrinsic::vastart:
          // Disallow inlining of functions that initialize VarArgs with
          // va_start.
          return InlineResult::failure(
              "contains VarArgs initialized with va_start");
        }
    }
  }

  return InlineResult::success();
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  // * optimization or size-optimization levels,
  // * a value passed to createFunctionInliningPass function, or
  // * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inlinehint-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(DefaultThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return DefaultThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}