//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45), cl::ZeroOrMore,
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore,
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false), cl::ZeroOrMore,
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallBase &CandidateCall;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  /// Upper bound for the inlining cost. Bonuses are being applied to account
  /// for speculative "expected profit" of the inlining decision.
  int Threshold;

  /// Inlining cost measured in abstract units, accounts for all the
  /// instructions expected to be executed for a given function invocation.
  /// Instructions that are statically proven to be dead based on call-site
  /// arguments are not counted here.
  int Cost = 0;

  bool ComputeFullInlineCost;

  bool IsCallerRecursive = false;
  bool IsRecursiveCall = false;
  bool ExposesReturnsTwice = false;
  bool HasDynamicAlloca = false;
  bool ContainsNoDuplicateCall = false;
  bool HasReturn = false;
  bool HasIndirectBr = false;
  bool HasUninlineableIntrinsic = false;
  bool InitsVargArgs = false;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize = 0;
  unsigned NumInstructions = 0;
  unsigned NumVectorInstructions = 0;

  /// Bonus to be applied when percentage of vector instructions in callee is
  /// high (see more details in updateThreshold).
  int VectorBonus = 0;
  /// Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus = 0;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost = 0;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallBase &Call);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallBase &Call, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at \p Call.
  bool allowSizeGrowth(CallBase &Call);

  /// Return true if \p Call is a cold callsite.
  bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p Call is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallBase &Call,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  InlineResult analyzeBlock(BasicBlock *BB,
                            SmallPtrSetImpl<const Value *> &EphValues);

  /// Handle a capped 'int' increment for Cost.
  void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) {
    assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound");
    Cost = (int)std::min(UpperBound, Cost + Inc);
  }

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitFNeg(UnaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallBase(CallBase &Call);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallBase &Call, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold),
        ComputeFullInlineCost(OptComputeFullInlineCost ||
                              Params.ComputeFullInlineCost || ORE),
        EnableLoadElimination(true) {}

  InlineResult analyzeCall(CallBase &Call);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs = 0;
  unsigned NumConstantOffsetPtrArgs = 0;
  unsigned NumAllocaArgs = 0;
  unsigned NumConstantPtrCmps = 0;
  unsigned NumConstantPtrDiffs = 0;
  unsigned NumInstructionsSimplified = 0;
  unsigned SROACostSavings = 0;
  unsigned SROACostSavingsLost = 0;

  void dump();
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  addCost(CostIt->second);
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    addCost(LoadEliminationCost);
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
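    // (Illustrative only, hypothetical type: for a struct {i32, i32, i64}, an
    // index of 2 would contribute the StructLayout offset of the third field.)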
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer with
    // constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check we can fold GEPs of constant-offset call site argument pointers.
/// This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
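    // (Once the offset is known, the GEP is expected to fold into the tracked
    // base-plus-constant-offset or into the target's addressing modes after
    // inlining, so no instruction cost is charged here.)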
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      addCost(InlineConstants::CallPenalty);
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary instructions.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCall.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallBase &Call) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Call.getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallBase &Call,
                                  BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CallSite(&Call), CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(Call.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() &&
      PSI->isHotCallSite(CallSite(&Call), CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = Call.getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(Call)) {
    Threshold = 0;
    return;
  }

  Function *Caller = Call.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that amount
  // is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->hasMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->hasOptSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->hasMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
    if (!Caller->hasOptSize() && HotCallSiteThreshold) {
      LLVM_DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(Call, CallerBFI)) {
      LLVM_DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        LLVM_DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        LLVM_DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
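      // (Illustrative, hypothetical values: if both sides track the same base
      // with offsets 4 and 8, an equality compare reduces to comparing the two
      // constant offsets and folds away with no runtime cost.)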
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such. Unless it's fneg which can be implemented with an xor.
  using namespace llvm::PatternMatch;
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive &&
      !match(&I, m_FNeg(m_Value())))
    addCost(InlineConstants::CallPenalty);

  return false;
}

bool CallAnalyzer::visitFNeg(UnaryOperator &I) {
  Value *Op = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Op);
  if (!COp)
    COp = SimplifiedValues.lookup(Op);

  Value *SimpleV = SimplifyFNegInst(COp ? COp : Op,
                                    cast<FPMathOperator>(I).getFastMathFlags(),
                                    DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified fneg.
  disableSROA(Op);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallBase &Call) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(&Call, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(Call.arg_size());
  for (Value *I : Call.args()) {
    Constant *C = dyn_cast<Constant>(I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(&Call, F, ConstantArgs)) {
    SimplifiedValues[&Call] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallBase(CallBase &Call) {
  if (Call.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (isa<CallInst>(Call) && cast<CallInst>(Call).cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = Call.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, Call))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Call)) {
      switch (II->getIntrinsicID()) {
      default:
        if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallBase(Call);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        addCost(3 * InlineConstants::InstrCost);
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
        InitsVargArgs = true;
        return false;
      }
    }

    if (F == Call.getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      addCost(Call.arg_size() * InlineConstants::InstrCost);

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(Call.getCalledValue()))
        addCost(InlineConstants::CallPenalty);
    }

    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = Call.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  addCost(Call.arg_size() * InlineConstants::InstrCost);

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!Call.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallBase(Call);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call,
                  IndirectCallParams);
  if (CA.analyzeCall(Call)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallBase(Call);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
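  // (A conditional branch whose condition has already been simplified to a
  // constant is expected to collapse into an unconditional branch once the
  // callee is inlined at this call site, so it is treated as free as well.)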
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    addCost((int64_t)SI.getNumCases() * InlineConstants::InstrCost);
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    addCost(JTCost, (int64_t)CostUpperBound);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes
  // which is the same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  // 2^(log2(n) - 1) - 1
  // = 2^log2(n) * 2^-1 - 1
  // = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  // n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    addCost(NumCaseCluster * 2 * InlineConstants::InstrCost);
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  addCost(SwitchCost, (int64_t)CostUpperBound);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
1530   HasIndirectBr = true;
1531   return false;
1532 }
1533
1534 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
1535   // FIXME: It's not clear that a single instruction is an accurate model for
1536   // the inline cost of a resume instruction.
1537   return false;
1538 }
1539
1540 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
1541   // FIXME: It's not clear that a single instruction is an accurate model for
1542   // the inline cost of a cleanupret instruction.
1543   return false;
1544 }
1545
1546 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
1547   // FIXME: It's not clear that a single instruction is an accurate model for
1548   // the inline cost of a catchret instruction.
1549   return false;
1550 }
1551
1552 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
1553   // FIXME: It might be reasonable to discount the cost of instructions leading
1554   // to unreachable as they have the lowest possible impact on both runtime and
1555   // code size.
1556   return true; // No actual code is needed for unreachable.
1557 }
1558
1559 bool CallAnalyzer::visitInstruction(Instruction &I) {
1560   // Some instructions are free. All of the free intrinsics can also be
1561   // handled by SROA, etc.
1562   if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
1563     return true;
1564
1565   // We found something we don't understand or can't handle. Mark any SROA-able
1566   // values in the operand list as no longer viable.
1567   for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
1568     disableSROA(*OI);
1569
1570   return false;
1571 }
1572
1573 /// Analyze a basic block for its contribution to the inline cost.
1574 ///
1575 /// This method walks the analyzer over every instruction in the given basic
1576 /// block and accounts for their cost during inlining at this callsite. It
1577 /// aborts early if the threshold has been exceeded or an impossible-to-inline
1578 /// construct has been detected. It returns false if inlining is no longer
1579 /// viable, and true if inlining remains viable.
1580 InlineResult
1581 CallAnalyzer::analyzeBlock(BasicBlock *BB,
1582                            SmallPtrSetImpl<const Value *> &EphValues) {
1583   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1584     // FIXME: Currently, the total number of instructions in a function,
1585     // regardless of our ability to simplify them to constants or dead code
1586     // during inlining, is used by the vector bonus heuristic. As long as that's
1587     // true, we have to special case debug intrinsics here to prevent differences
1588     // in inlining due to debug symbols. Eventually, the number of unsimplified
1589     // instructions shouldn't factor into the cost computation, but until then,
1590     // hack around it here.
1591     if (isa<DbgInfoIntrinsic>(I))
1592       continue;
1593
1594     // Skip ephemeral values.
1595     if (EphValues.count(&*I))
1596       continue;
1597
1598     ++NumInstructions;
1599     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1600       ++NumVectorInstructions;
1601
1602     // If the instruction simplified to a constant, there is no cost to this
1603     // instruction. Visit the instructions using our InstVisitor to account for
1604     // all of the per-instruction logic. The visit tree returns true if we
1605     // consumed the instruction in any way, and false if the instruction's base
1606     // cost should count against inlining.
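    // Either the visitor fully accounts for the instruction (for example, it
    // simplified away), or we charge the generic per-instruction cost below.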
1607     if (Base::visit(&*I))
1608       ++NumInstructionsSimplified;
1609     else
1610       addCost(InlineConstants::InstrCost);
1611
1612     using namespace ore;
1613     // If visiting this instruction detected an uninlinable pattern, abort.
1614     InlineResult IR;
1615     if (IsRecursiveCall)
1616       IR = "recursive";
1617     else if (ExposesReturnsTwice)
1618       IR = "exposes returns twice";
1619     else if (HasDynamicAlloca)
1620       IR = "dynamic alloca";
1621     else if (HasIndirectBr)
1622       IR = "indirect branch";
1623     else if (HasUninlineableIntrinsic)
1624       IR = "uninlinable intrinsic";
1625     else if (InitsVargArgs)
1626       IR = "varargs";
1627     if (!IR) {
1628       if (ORE)
1629         ORE->emit([&]() {
1630           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1631                                           &CandidateCall)
1632                  << NV("Callee", &F) << " has uninlinable pattern ("
1633                  << NV("InlineResult", IR.message)
1634                  << ") and cost is not fully computed";
1635         });
1636       return IR;
1637     }
1638
1639     // If the caller is a recursive function, then we don't want to inline
1640     // functions which allocate a lot of stack space because it would increase
1641     // the caller's stack usage dramatically.
1642     if (IsCallerRecursive &&
1643         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1644       InlineResult IR = "recursive and allocates too much stack space";
1645       if (ORE)
1646         ORE->emit([&]() {
1647           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1648                                           &CandidateCall)
1649                  << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
1650                  << ". Cost is not fully computed";
1651         });
1652       return IR;
1653     }
1654
1655     // Check if we've passed the maximum possible threshold so we don't spin in
1656     // huge basic blocks that will never inline.
1657     if (Cost >= Threshold && !ComputeFullInlineCost)
1658       return false;
1659   }
1660
1661   return true;
1662 }
1663
1664 /// Compute the base pointer and cumulative constant offsets for V.
1665 ///
1666 /// This strips all constant offsets off of V, leaving only the base pointer,
1667 /// and accumulates the total constant offset applied in the returned constant.
1668 /// It returns nullptr if V is not a pointer, and returns the constant '0' if
1669 /// there are no constant offsets applied.
1670 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1671   if (!V->getType()->isPointerTy())
1672     return nullptr;
1673
1674   unsigned AS = V->getType()->getPointerAddressSpace();
1675   unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
1676   APInt Offset = APInt::getNullValue(IntPtrWidth);
1677
1678   // Even though we don't look through PHI nodes, we could be called on an
1679   // instruction in an unreachable block, which may be on a cycle.
1680   SmallPtrSet<Value *, 4> Visited;
1681   Visited.insert(V);
1682   do {
1683     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1684       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1685         return nullptr;
1686       V = GEP->getPointerOperand();
1687     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1688       V = cast<Operator>(V)->getOperand(0);
1689     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1690       if (GA->isInterposable())
1691         break;
1692       V = GA->getAliasee();
1693     } else {
1694       break;
1695     }
1696     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1697   } while (Visited.insert(V).second);
1698
1699   Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
1700   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1701 }
1702
1703 /// Find dead blocks due to deleted CFG edges during inlining.
1704 ///
1705 /// If we know the successor of the current block, \p CurrBB, has to be \p
1706 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1707 /// no live incoming CFG edges. If one block is found to be dead, we can
1708 /// continue growing the dead block list by checking the successors of the dead
1709 /// blocks to see if all their incoming edges are dead or not.
1710 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1711   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1712     // A CFG edge is dead if the predecessor is dead or the predecessor has a
1713     // known successor which is not the one under examination.
1714     return (DeadBlocks.count(Pred) ||
1715             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1716   };
1717
1718   auto IsNewlyDead = [&](BasicBlock *BB) {
1719     // If all the edges to a block are dead, the block is also dead.
1720     return (!DeadBlocks.count(BB) &&
1721             llvm::all_of(predecessors(BB),
1722                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1723   };
1724
1725   for (BasicBlock *Succ : successors(CurrBB)) {
1726     if (Succ == NextBB || !IsNewlyDead(Succ))
1727       continue;
1728     SmallVector<BasicBlock *, 4> NewDead;
1729     NewDead.push_back(Succ);
1730     while (!NewDead.empty()) {
1731       BasicBlock *Dead = NewDead.pop_back_val();
1732       if (DeadBlocks.insert(Dead))
1733         // Continue growing the dead block list.
1734         for (BasicBlock *S : successors(Dead))
1735           if (IsNewlyDead(S))
1736             NewDead.push_back(S);
1737     }
1738   }
1739 }
1740
1741 /// Analyze a call site for potential inlining.
1742 ///
1743 /// Returns true if inlining this call is viable, and false if it is not
1744 /// viable. It computes the cost and adjusts the threshold based on numerous
1745 /// factors and heuristics. If this method returns false but the computed cost
1746 /// is below the computed threshold, then inlining was forcibly disabled by
1747 /// some artifact of the routine.
1748 InlineResult CallAnalyzer::analyzeCall(CallBase &Call) {
1749   ++NumCallsAnalyzed;
1750
1751   // Perform some tweaks to the cost and threshold based on the direct
1752   // callsite information.
1753
1754   // We want to more aggressively inline vector-dense kernels, so up the
1755   // threshold, and we'll lower it if the % of vector instructions gets too
1756   // low. Note that these bonuses are somewhat arbitrary and evolved over time
1757   // by accident as much as because they are principled bonuses.
1758   //
1759   // FIXME: It would be nice to remove all such bonuses. At least it would be
1760   // nice to base the bonus values on something more scientific.
1761   assert(NumInstructions == 0);
1762   assert(NumVectorInstructions == 0);
1763
1764   // Update the threshold based on callsite properties.
1765   updateThreshold(Call, F);
1766
1767   // While Threshold depends on commandline options that can take negative
1768   // values, we want to enforce the invariant that the computed threshold and
1769   // bonuses are non-negative.
1770   assert(Threshold >= 0);
1771   assert(SingleBBBonus >= 0);
1772   assert(VectorBonus >= 0);
1773
1774   // Speculatively apply all possible bonuses to Threshold. If Cost ever
1775   // exceeds this Threshold, then since Cost cannot decrease, we can stop
1776   // processing the rest of the function body.
1777   Threshold += (SingleBBBonus + VectorBonus);
1778
1779   // Give out bonuses for the callsite, as the instructions setting them up
1780   // will be gone after inlining.
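  // getCallsiteCost returns a positive estimate of the call overhead (argument
  // setup plus the call itself), so subtracting it credits that saving up
  // front; e.g. a call with two non-byval arguments is credited
  // 3 * InstrCost + CallPenalty.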
1781 addCost(-getCallsiteCost(Call, DL)); 1782 1783 // If this function uses the coldcc calling convention, prefer not to inline 1784 // it. 1785 if (F.getCallingConv() == CallingConv::Cold) 1786 Cost += InlineConstants::ColdccPenalty; 1787 1788 // Check if we're done. This can happen due to bonuses and penalties. 1789 if (Cost >= Threshold && !ComputeFullInlineCost) 1790 return "high cost"; 1791 1792 if (F.empty()) 1793 return true; 1794 1795 Function *Caller = Call.getFunction(); 1796 // Check if the caller function is recursive itself. 1797 for (User *U : Caller->users()) { 1798 CallBase *Call = dyn_cast<CallBase>(U); 1799 if (Call && Call->getFunction() == Caller) { 1800 IsCallerRecursive = true; 1801 break; 1802 } 1803 } 1804 1805 // Populate our simplified values by mapping from function arguments to call 1806 // arguments with known important simplifications. 1807 auto CAI = Call.arg_begin(); 1808 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end(); 1809 FAI != FAE; ++FAI, ++CAI) { 1810 assert(CAI != Call.arg_end()); 1811 if (Constant *C = dyn_cast<Constant>(CAI)) 1812 SimplifiedValues[&*FAI] = C; 1813 1814 Value *PtrArg = *CAI; 1815 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { 1816 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); 1817 1818 // We can SROA any pointer arguments derived from alloca instructions. 1819 if (isa<AllocaInst>(PtrArg)) { 1820 SROAArgValues[&*FAI] = PtrArg; 1821 SROAArgCosts[PtrArg] = 0; 1822 } 1823 } 1824 } 1825 NumConstantArgs = SimplifiedValues.size(); 1826 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); 1827 NumAllocaArgs = SROAArgValues.size(); 1828 1829 // FIXME: If a caller has multiple calls to a callee, we end up recomputing 1830 // the ephemeral values multiple times (and they're completely determined by 1831 // the callee, so this is purely duplicate work). 1832 SmallPtrSet<const Value *, 32> EphValues; 1833 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues); 1834 1835 // The worklist of live basic blocks in the callee *after* inlining. We avoid 1836 // adding basic blocks of the callee which can be proven to be dead for this 1837 // particular call site in order to get more accurate cost estimates. This 1838 // requires a somewhat heavyweight iteration pattern: we need to walk the 1839 // basic blocks in a breadth-first order as we insert live successors. To 1840 // accomplish this, prioritizing for small iterations because we exit after 1841 // crossing our threshold, we use a small-size optimized SetVector. 1842 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, 1843 SmallPtrSet<BasicBlock *, 16>> 1844 BBSetVector; 1845 BBSetVector BBWorklist; 1846 BBWorklist.insert(&F.getEntryBlock()); 1847 bool SingleBB = true; 1848 // Note that we *must not* cache the size, this loop grows the worklist. 1849 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { 1850 // Bail out the moment we cross the threshold. This means we'll under-count 1851 // the cost, but only when undercounting doesn't matter. 1852 if (Cost >= Threshold && !ComputeFullInlineCost) 1853 break; 1854 1855 BasicBlock *BB = BBWorklist[Idx]; 1856 if (BB->empty()) 1857 continue; 1858 1859 // Disallow inlining a blockaddress with uses other than strictly callbr. 1860 // A blockaddress only has defined behavior for an indirect branch in the 1861 // same function, and we do not currently support inlining indirect 1862 // branches. 
But, the inliner may not see an indirect branch that ends up
1863     // being dead code at a particular call site. If the blockaddress escapes
1864     // the function, e.g., via a global variable, inlining may lead to an
1865     // invalid cross-function reference.
1866     // FIXME: pr/39560: continue relaxing this overt restriction.
1867     if (BB->hasAddressTaken())
1868       for (User *U : BlockAddress::get(&*BB)->users())
1869         if (!isa<CallBrInst>(*U))
1870           return "blockaddress used outside of callbr";
1871
1872     // Analyze the cost of this block. If we blow through the threshold, this
1873     // returns false, and we can bail out.
1874     InlineResult IR = analyzeBlock(BB, EphValues);
1875     if (!IR)
1876       return IR;
1877
1878     Instruction *TI = BB->getTerminator();
1879
1880     // Add in the live successors by first checking whether we have a terminator
1881     // that may be simplified based on the values simplified by this call.
1882     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1883       if (BI->isConditional()) {
1884         Value *Cond = BI->getCondition();
1885         if (ConstantInt *SimpleCond =
1886                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1887           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1888           BBWorklist.insert(NextBB);
1889           KnownSuccessors[BB] = NextBB;
1890           findDeadBlocks(BB, NextBB);
1891           continue;
1892         }
1893       }
1894     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1895       Value *Cond = SI->getCondition();
1896       if (ConstantInt *SimpleCond =
1897               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1898         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1899         BBWorklist.insert(NextBB);
1900         KnownSuccessors[BB] = NextBB;
1901         findDeadBlocks(BB, NextBB);
1902         continue;
1903       }
1904     }
1905
1906     // If we're unable to select a particular successor, just count all of
1907     // them.
1908     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1909          ++TIdx)
1910       BBWorklist.insert(TI->getSuccessor(TIdx));
1911
1912     // If we had any successors at this point, then post-inlining is likely to
1913     // have them as well. Note that we assume any basic blocks which existed
1914     // due to branches or switches which folded above will also fold after
1915     // inlining.
1916     if (SingleBB && TI->getNumSuccessors() > 1) {
1917       // Take off the bonus we applied to the threshold.
1918       Threshold -= SingleBBBonus;
1919       SingleBB = false;
1920     }
1921   }
1922
1923   bool OnlyOneCallAndLocalLinkage =
1924       F.hasLocalLinkage() && F.hasOneUse() && &F == Call.getCalledFunction();
1925   // If this is a noduplicate call, we can still inline as long as
1926   // inlining this would cause the removal of the caller (so the instruction
1927   // is not actually duplicated, just moved).
1928   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1929     return "noduplicate";
1930
1931   // Loops generally act a lot like calls in that they act like barriers to
1932   // movement, require a certain amount of setup, etc. So when optimizing for
1933   // size, we penalize any call sites whose callee contains loops. We do this
1934   // after all other costs here, so will likely only be dealing with relatively
1935   // small functions (and hence DT and LI will hopefully be cheap).
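  // Only loops whose headers are still live for this particular call site are
  // counted; each one is charged InlineConstants::CallPenalty below.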
1936 if (Caller->hasMinSize()) { 1937 DominatorTree DT(F); 1938 LoopInfo LI(DT); 1939 int NumLoops = 0; 1940 for (Loop *L : LI) { 1941 // Ignore loops that will not be executed 1942 if (DeadBlocks.count(L->getHeader())) 1943 continue; 1944 NumLoops++; 1945 } 1946 addCost(NumLoops * InlineConstants::CallPenalty); 1947 } 1948 1949 // We applied the maximum possible vector bonus at the beginning. Now, 1950 // subtract the excess bonus, if any, from the Threshold before 1951 // comparing against Cost. 1952 if (NumVectorInstructions <= NumInstructions / 10) 1953 Threshold -= VectorBonus; 1954 else if (NumVectorInstructions <= NumInstructions / 2) 1955 Threshold -= VectorBonus/2; 1956 1957 return Cost < std::max(1, Threshold); 1958 } 1959 1960 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1961 /// Dump stats about this call's analysis. 1962 LLVM_DUMP_METHOD void CallAnalyzer::dump() { 1963 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" 1964 DEBUG_PRINT_STAT(NumConstantArgs); 1965 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); 1966 DEBUG_PRINT_STAT(NumAllocaArgs); 1967 DEBUG_PRINT_STAT(NumConstantPtrCmps); 1968 DEBUG_PRINT_STAT(NumConstantPtrDiffs); 1969 DEBUG_PRINT_STAT(NumInstructionsSimplified); 1970 DEBUG_PRINT_STAT(NumInstructions); 1971 DEBUG_PRINT_STAT(SROACostSavings); 1972 DEBUG_PRINT_STAT(SROACostSavingsLost); 1973 DEBUG_PRINT_STAT(LoadEliminationCost); 1974 DEBUG_PRINT_STAT(ContainsNoDuplicateCall); 1975 DEBUG_PRINT_STAT(Cost); 1976 DEBUG_PRINT_STAT(Threshold); 1977 #undef DEBUG_PRINT_STAT 1978 } 1979 #endif 1980 1981 /// Test that there are no attribute conflicts between Caller and Callee 1982 /// that prevent inlining. 1983 static bool functionsHaveCompatibleAttributes(Function *Caller, 1984 Function *Callee, 1985 TargetTransformInfo &TTI) { 1986 return TTI.areInlineCompatible(Caller, Callee) && 1987 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 1988 } 1989 1990 int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) { 1991 int Cost = 0; 1992 for (unsigned I = 0, E = Call.arg_size(); I != E; ++I) { 1993 if (Call.isByValArgument(I)) { 1994 // We approximate the number of loads and stores needed by dividing the 1995 // size of the byval type by the target's pointer size. 1996 PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType()); 1997 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType()); 1998 unsigned AS = PTy->getAddressSpace(); 1999 unsigned PointerSize = DL.getPointerSizeInBits(AS); 2000 // Ceiling division. 2001 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; 2002 2003 // If it generates more than 8 stores it is likely to be expanded as an 2004 // inline memcpy so we take that as an upper bound. Otherwise we assume 2005 // one load and one store per word copied. 2006 // FIXME: The maxStoresPerMemcpy setting from the target should be used 2007 // here instead of a magic number of 8, but it's not available via 2008 // DataLayout. 2009 NumStores = std::min(NumStores, 8U); 2010 2011 Cost += 2 * NumStores * InlineConstants::InstrCost; 2012 } else { 2013 // For non-byval arguments subtract off one instruction per call 2014 // argument. 2015 Cost += InlineConstants::InstrCost; 2016 } 2017 } 2018 // The call instruction also disappears after inlining. 
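  // Credit both the removed call instruction (InstrCost) and the general
  // overhead of making a call (CallPenalty).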
2019   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
2020   return Cost;
2021 }
2022
2023 InlineCost llvm::getInlineCost(
2024     CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
2025     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2026     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2027     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2028   return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
2029                        GetAssumptionCache, GetBFI, PSI, ORE);
2030 }
2031
2032 InlineCost llvm::getInlineCost(
2033     CallBase &Call, Function *Callee, const InlineParams &Params,
2034     TargetTransformInfo &CalleeTTI,
2035     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
2036     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
2037     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
2038
2039   // Cannot inline indirect calls.
2040   if (!Callee)
2041     return llvm::InlineCost::getNever("indirect call");
2042
2043   // Never inline calls with byval arguments that do not have the alloca
2044   // address space. Since byval arguments can be replaced with a copy to an
2045   // alloca, the inlined code would need to be adjusted to handle that the
2046   // argument is in the alloca address space (so it is a little bit complicated
2047   // to solve).
2048   unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
2049   for (unsigned I = 0, E = Call.arg_size(); I != E; ++I)
2050     if (Call.isByValArgument(I)) {
2051       PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
2052       if (PTy->getAddressSpace() != AllocaAS)
2053         return llvm::InlineCost::getNever("byval arguments without alloca"
2054                                           " address space");
2055     }
2056
2057   // Calls to functions with always-inline attributes should be inlined
2058   // whenever possible.
2059   if (Call.hasFnAttr(Attribute::AlwaysInline)) {
2060     auto IsViable = isInlineViable(*Callee);
2061     if (IsViable)
2062       return llvm::InlineCost::getAlways("always inline attribute");
2063     return llvm::InlineCost::getNever(IsViable.message);
2064   }
2065
2066   // Never inline functions with conflicting attributes (unless the callee has
2067   // the always-inline attribute).
2068   Function *Caller = Call.getCaller();
2069   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
2070     return llvm::InlineCost::getNever("conflicting attributes");
2071
2072   // Don't inline this call if the caller has the optnone attribute.
2073   if (Caller->hasOptNone())
2074     return llvm::InlineCost::getNever("optnone attribute");
2075
2076   // Don't inline a function that treats the null pointer as valid into a
2077   // caller that does not have this attribute.
2078   if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
2079     return llvm::InlineCost::getNever("nullptr definitions incompatible");
2080
2081   // Don't inline functions which can be interposed at link-time.
2082   if (Callee->isInterposable())
2083     return llvm::InlineCost::getNever("interposable");
2084
2085   // Don't inline functions marked noinline.
2086   if (Callee->hasFnAttribute(Attribute::NoInline))
2087     return llvm::InlineCost::getNever("noinline function attribute");
2088
2089   // Don't inline call sites marked noinline.
2090   if (Call.isNoInline())
2091     return llvm::InlineCost::getNever("noinline call site attribute");
2092
2093   LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
2094                           << "... 
(caller:" << Caller->getName() << ")\n"); 2095 2096 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, 2097 Call, Params); 2098 InlineResult ShouldInline = CA.analyzeCall(Call); 2099 2100 LLVM_DEBUG(CA.dump()); 2101 2102 // Check if there was a reason to force inlining or no inlining. 2103 if (!ShouldInline && CA.getCost() < CA.getThreshold()) 2104 return InlineCost::getNever(ShouldInline.message); 2105 if (ShouldInline && CA.getCost() >= CA.getThreshold()) 2106 return InlineCost::getAlways("empty function"); 2107 2108 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 2109 } 2110 2111 InlineResult llvm::isInlineViable(Function &F) { 2112 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 2113 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { 2114 // Disallow inlining of functions which contain indirect branches. 2115 if (isa<IndirectBrInst>(BI->getTerminator())) 2116 return "contains indirect branches"; 2117 2118 // Disallow inlining of blockaddresses which are used by non-callbr 2119 // instructions. 2120 if (BI->hasAddressTaken()) 2121 for (User *U : BlockAddress::get(&*BI)->users()) 2122 if (!isa<CallBrInst>(*U)) 2123 return "blockaddress used outside of callbr"; 2124 2125 for (auto &II : *BI) { 2126 CallBase *Call = dyn_cast<CallBase>(&II); 2127 if (!Call) 2128 continue; 2129 2130 // Disallow recursive calls. 2131 if (&F == Call->getCalledFunction()) 2132 return "recursive call"; 2133 2134 // Disallow calls which expose returns-twice to a function not previously 2135 // attributed as such. 2136 if (!ReturnsTwice && isa<CallInst>(Call) && 2137 cast<CallInst>(Call)->canReturnTwice()) 2138 return "exposes returns-twice attribute"; 2139 2140 if (Call->getCalledFunction()) 2141 switch (Call->getCalledFunction()->getIntrinsicID()) { 2142 default: 2143 break; 2144 // Disallow inlining of @llvm.icall.branch.funnel because current 2145 // backend can't separate call targets from call arguments. 2146 case llvm::Intrinsic::icall_branch_funnel: 2147 return "disallowed inlining of @llvm.icall.branch.funnel"; 2148 // Disallow inlining functions that call @llvm.localescape. Doing this 2149 // correctly would require major changes to the inliner. 2150 case llvm::Intrinsic::localescape: 2151 return "disallowed inlining of @llvm.localescape"; 2152 // Disallow inlining of functions that initialize VarArgs with va_start. 2153 case llvm::Intrinsic::vastart: 2154 return "contains VarArgs initialized with va_start"; 2155 } 2156 } 2157 } 2158 2159 return true; 2160 } 2161 2162 // APIs to create InlineParams based on command line flags and/or other 2163 // parameters. 2164 2165 InlineParams llvm::getInlineParams(int Threshold) { 2166 InlineParams Params; 2167 2168 // This field is the threshold to use for a callee by default. This is 2169 // derived from one or more of: 2170 // * optimization or size-optimization levels, 2171 // * a value passed to createFunctionInliningPass function, or 2172 // * the -inline-threshold flag. 2173 // If the -inline-threshold flag is explicitly specified, that is used 2174 // irrespective of anything else. 2175 if (InlineThreshold.getNumOccurrences() > 0) 2176 Params.DefaultThreshold = InlineThreshold; 2177 else 2178 Params.DefaultThreshold = Threshold; 2179 2180 // Set the HintThreshold knob from the -inlinehint-threshold. 2181 Params.HintThreshold = HintThreshold; 2182 2183 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. 
2184   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2185
2186   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2187   // populate LocallyHotCallSiteThreshold. Later, we populate
2188   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2189   // we know that optimization level is O3 (in the getInlineParams variant that
2190   // takes the opt and size levels).
2191   // FIXME: Remove this check (and make the assignment unconditional) after
2192   // addressing size regression issues at O2.
2193   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2194     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2195
2196   // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
2197   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2198
2199   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2200   // -inline-threshold commandline option is not explicitly given. If that
2201   // option is present, then its value applies even for callees with size and
2202   // minsize attributes.
2203   // If the -inline-threshold is not specified, set the ColdThreshold from the
2204   // -inlinecold-threshold even if it is not explicitly passed. If
2205   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2206   // explicitly specified to set the ColdThreshold knob.
2207   if (InlineThreshold.getNumOccurrences() == 0) {
2208     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2209     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2210     Params.ColdThreshold = ColdThreshold;
2211   } else if (ColdThreshold.getNumOccurrences() > 0) {
2212     Params.ColdThreshold = ColdThreshold;
2213   }
2214   return Params;
2215 }
2216
2217 InlineParams llvm::getInlineParams() {
2218   return getInlineParams(InlineThreshold);
2219 }
2220
2221 // Compute the default threshold for inlining based on the opt level and the
2222 // size opt level.
2223 static int computeThresholdFromOptLevels(unsigned OptLevel,
2224                                          unsigned SizeOptLevel) {
2225   if (OptLevel > 2)
2226     return InlineConstants::OptAggressiveThreshold;
2227   if (SizeOptLevel == 1) // -Os
2228     return InlineConstants::OptSizeThreshold;
2229   if (SizeOptLevel == 2) // -Oz
2230     return InlineConstants::OptMinSizeThreshold;
2231   return InlineThreshold;
2232 }
2233
2234 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2235   auto Params =
2236       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2237   // At O3, use the value of the -locally-hot-callsite-threshold option to
2238   // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag takes
2239   // effect only when it is specified explicitly.
2240   if (OptLevel > 2)
2241     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2242   return Params;
2243 }
2244