//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  bool ComputeFullInlineCost;

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost = 0;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallBase &Call, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE),
        EnableLoadElimination(true) {}

  InlineResult analyzeCall(CallBase &Call);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;
  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  void dump();
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  addCost(CostIt->second);
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see the ZeroOffset is used as a dummy value, so we can probably use
  // any bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this
      // time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      addCost(InlineConstants::CallPenalty);
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below inlining hot_call_X() may be
  // beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallBase &Call,
                                  BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CallSite(&Call), CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() &&
      PSI->isHotCallSite(CallSite(&Call), CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that
  // amount is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter setup
    // and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness can not be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    addCost(InlineConstants::CallPenalty);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = Call.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, Call))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
      switch (II->getIntrinsicID()) {
      default:
        if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallBase(Call);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        addCost(3 * InlineConstants::InstrCost);
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
        InitsVargArgs = true;
        return false;
      }
    }

    if (F == Call.getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      addCost(Call.arg_size() * InlineConstants::InstrCost);

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(Call.getCalledValue()))
        addCost(InlineConstants::CallPenalty);
    }

    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = Call.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  addCost(Call.arg_size() * InlineConstants::InstrCost);

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
                  IndirectCallParams);
  if (CA.analyzeCall(Call)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallBase(Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost);
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    addCost(JTCost, (int64_t)CostUpperBound);
    return false;
  }

  // Considering forming a binary search tree, we should find the number of
  // nodes, which is the same as the number of comparisons when lowered. For a
  // given number of clusters, n, we can define a recursive function, f(n), to
  // find the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n / 2) + f(n - n / 2), when n > 3,
  //   and f(n) = n, when n <= 3.
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  addCost(SwitchCost, (int64_t)CostUpperBound);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions
  // leading to unreachable as they have the lowest possible impact on both
  // runtime and code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
InlineResult
CallAnalyzer::analyzeBlock(BasicBlock *BB,
                           SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      addCost(InlineConstants::InstrCost);

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    InlineResult IR;
    if (IsRecursiveCall)
      IR = "recursive";
    else if (ExposesReturnsTwice)
      IR = "exposes returns twice";
    else if (HasDynamicAlloca)
      IR = "dynamic alloca";
    else if (HasIndirectBr)
      IR = "indirect branch";
    else if (HasUninlineableIntrinsic)
      IR = "uninlinable intrinsic";
    else if (InitsVargArgs)
      IR = "varargs";
    if (!IR) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " has uninlinable pattern ("
                 << NV("InlineResult", IR.message)
                 << ") and cost is not fully computed";
        });
      return IR;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      InlineResult IR = "recursive and allocates too much stack space";
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          &CandidateCall)
                 << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
                 << ". Cost is not fully computed";
        });
      return IR;
    }

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}

/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
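///
/// For illustration (a hypothetical argument, not taken from this file):
/// given "getelementptr inbounds [4 x i32], [4 x i32]* @G, i64 0, i64 2",
/// this rewrites V to @G and returns the constant 8 (2 * 4 bytes, assuming a
/// typical data layout with 32-bit i32).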
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under exam.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
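///
/// As a rough illustration (the bonus percentages are configured elsewhere in
/// this file and may differ): with a base threshold of 225, a 50% single-BB
/// bonus and a 150% vector bonus, the speculative threshold below starts at
/// 225 + 112 + 337 = 674; bonuses that turn out to be unearned are removed
/// again before the final Cost vs. Threshold comparison.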
InlineResult CallAnalyzer::analyzeCall(CallBase &Call) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(Call, F);

  // While Threshold depends on commandline options that can take negative
  // values, we want to enforce the invariant that the computed threshold and
  // bonuses are non-negative.
  assert(Threshold >= 0);
  assert(SingleBBBonus >= 0);
  assert(VectorBonus >= 0);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);

  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  addCost(-getCallsiteCost(Call, DL));

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return "high cost";

  if (F.empty())
    return true;

  Function *Caller = Call.getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallBase *Call = dyn_cast<CallBase>(U);
    if (Call && Call->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  auto CAI = Call.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != Call.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
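  // Ephemeral values are values that only feed assumptions (@llvm.assume and
  // the like) and so will not turn into real instructions after inlining;
  // they are skipped when counting instructions in analyzeBlock.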
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress with uses other than strictly callbr.
    // A blockaddress only has defined behavior for an indirect branch in the
    // same function, and we do not currently support inlining indirect
    // branches. But, the inliner may not see an indirect branch that ends up
    // being dead code at a particular call site. If the blockaddress escapes
    // the function, e.g., via a global variable, inlining may lead to an
    // invalid cross-function reference.
    // FIXME: pr/39560: continue relaxing this overt restriction.
    if (BB->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BB)->users())
        if (!isa<CallBrInst>(*U))
          return "blockaddress used outside of callbr";

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    InlineResult IR = analyzeBlock(BB, EphValues);
    if (!IR)
      return IR;

    Instruction *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
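    // (This is the conservative fallback: for example, a conditional branch
    // whose condition did not simplify to a ConstantInt above keeps all of
    // its successors live.)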
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return "noduplicate";

  // Loops generally act a lot like calls in that they act like barriers to
  // movement, require a certain amount of setup, etc. So when optimising for
  // size, we penalise any call sites that perform loops. We do this after all
  // other costs here, so will likely only be dealing with relatively small
  // functions (and hence DT and LI will hopefully be cheap).
  if (Caller->hasMinSize()) {
    DominatorTree DT(F);
    LoopInfo LI(DT);
    int NumLoops = 0;
    for (Loop *L : LI) {
      // Ignore loops that will not be executed.
      if (DeadBlocks.count(L->getHeader()))
        continue;
      NumLoops++;
    }
    addCost(NumLoops * InlineConstants::CallPenalty);
  }

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
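///
/// The TTI hook typically rejects mismatched target-cpu/target-feature
/// combinations, while AttributeFuncs::areInlineCompatible covers IR-level
/// attributes such as the sanitizer attributes; the exact rules live in those
/// helpers rather than here.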
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) {
    if (Call.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

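      // Illustration (hypothetical numbers): a 256-bit byval struct copied
      // with 64-bit pointers needs ceil(256 / 64) = 4 stores, so the argument
      // is charged 2 * 4 * InstrCost below.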
      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallBase &Call, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever("indirect call");

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
    if (Call.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever("byval arguments without alloca"
                                          " address space");
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (Call.hasFnAttr(Attribute::AlwaysInline)) {
    auto IsViable = isInlineViable(*Callee);
    if (IsViable)
      return llvm::InlineCost::getAlways("always inline attribute");
    return llvm::InlineCost::getNever(IsViable.message);
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = Call.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever("conflicting attributes");

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasOptNone())
    return llvm::InlineCost::getNever("optnone attribute");

  // Don't inline a function that treats null pointer as valid into a caller
  // that does not have this attribute.
  if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
    return llvm::InlineCost::getNever("nullptr definitions incompatible");

  // Don't inline functions which can be interposed at link-time.
  if (Callee->isInterposable())
    return llvm::InlineCost::getNever("interposable");

  // Don't inline functions marked noinline.
  if (Callee->hasFnAttribute(Attribute::NoInline))
    return llvm::InlineCost::getNever("noinline function attribute");

  // Don't inline call sites marked noinline.
  if (Call.isNoInline())
    return llvm::InlineCost::getNever("noinline call site attribute");

  LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                          << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee,
                  Call, Params);
  InlineResult ShouldInline = CA.analyzeCall(Call);

  LLVM_DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever(ShouldInline.message);
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways("empty function");

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

InlineResult llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches.
    if (isa<IndirectBrInst>(BI->getTerminator()))
      return "contains indirect branches";

    // Disallow inlining of blockaddresses which are used by non-callbr
    // instructions.
    if (BI->hasAddressTaken())
      for (User *U : BlockAddress::get(&*BI)->users())
        if (!isa<CallBrInst>(*U))
          return "blockaddress used outside of callbr";

    for (auto &II : *BI) {
      CallBase *Call = dyn_cast<CallBase>(&II);
      if (!Call)
        continue;

      // Disallow recursive calls.
      if (&F == Call->getCalledFunction())
        return "recursive call";

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
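      // (The canonical example of such a call is a call to setjmp.)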
      if (!ReturnsTwice && isa<CallInst>(Call) &&
          cast<CallInst>(Call)->canReturnTwice())
        return "exposes returns-twice attribute";

      if (Call->getCalledFunction())
        switch (Call->getCalledFunction()->getIntrinsicID()) {
        default:
          break;
        // Disallow inlining of @llvm.icall.branch.funnel because current
        // backend can't separate call targets from call arguments.
        case llvm::Intrinsic::icall_branch_funnel:
          return "disallowed inlining of @llvm.icall.branch.funnel";
        // Disallow inlining functions that call @llvm.localescape. Doing this
        // correctly would require major changes to the inliner.
        case llvm::Intrinsic::localescape:
          return "disallowed inlining of @llvm.localescape";
        // Disallow inlining of functions that initialize VarArgs with
        // va_start.
        case llvm::Intrinsic::vastart:
          return "contains VarArgs initialized with va_start";
        }
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
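// Usage sketch (illustrative only; the surrounding pass supplies CB,
// CalleeTTI, GetAssumptionCache, GetBFI, PSI and ORE from its analyses):
//   InlineParams Params = getInlineParams(OptLevel, SizeOptLevel);
//   InlineCost IC = getInlineCost(CB, Params, CalleeTTI, GetAssumptionCache,
//                                 GetBFI, PSI, &ORE);
//   if (!IC.isNever()) { /* candidate for inlining */ }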