//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help with the performance of
// instrumentation-based PGO before we actually hook up the inliner with
// analysis passes such as BPI and BFI.
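// (A callee carrying the cold attribute is then held to the 45-point
// threshold below, a fifth of the 225-point default.)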
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  /// Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining.
  /// The most important aspect we track is CFG-altering simplifications --
  /// when we prove a basic block dead, that can cause dramatic shifts in the
  /// cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be loads.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
        EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
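///
/// Such values are tracked in SROAArgValues, which maps them back to the
/// caller-stack allocas they are derived from.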
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
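/// Each index operand is remapped to its simplified constant, when one is
/// known, before the GEP is handed to TTI for the cost query.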
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.

  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits());
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this
      // time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// \brief Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
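/// Returns true and records the result in SimplifiedValues when \p Evaluate
/// produces a constant; returns false otherwise.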
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  //   main() {
  //     hot_call_1();
  //     ...
  //     hot_call_N()
  //     exit(0);
  //   }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions.
  // A bonus is applied if the vector instructions exceed 50% and half that
  // amount is applied if they exceed 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter
    // setup and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
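      // (Entry-level hotness is a weaker signal than callsite hotness: a hot
      // function may still contain individually cold call sites.)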
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this
  // method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
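  // (A comparison of an SROA-able pointer against null is credited as a
  // saving below, since it folds away once the alloca is scalarized; any
  // other comparison pins the alloca and cancels the candidate.)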
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
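  // For example, a second unordered load of the same address with no
  // intervening store or call is modeled as free here, with the saving
  // tracked in LoadEliminationCost.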
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  //    from the cost even when we finally see a store. We just need to
  //    disable *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  //    this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't
        // free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
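    // (For instance, a remaining threshold of 100 against a cost of 40
    // yields a 60-point credit toward this call site.)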
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search tree, we should find the number of
  // nodes, which is the same as the number of comparisons when lowered. For a
  // given number of clusters, n, we can define a recursive function, f(n), to
  // find the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //     = 2^log2(n) * 2^-1 - 1
  //     = n / 2 - 1.
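  // For example, n = 8 clusters gives 8 leaf comparisons plus
  // 8 / 2 - 1 = 3 non-leaf comparisons, 11 in total, matching the closed
  // form below.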
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. Inlining
  // one would be incorrect because all the blockaddresses (in static global
  // initializers for example) would still refer to the original function, and
  // the indirect jump would jump from the inlined copy of the function into
  // the original function, which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
1531 bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
1532                                 SmallPtrSetImpl<const Value *> &EphValues) {
1533   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
1534     // FIXME: Currently, the number of instructions in a function, regardless
1535     // of our ability to simplify them to constants or dead code during
1536     // inlining, is what the vector bonus heuristic actually uses. As long as
1537     // that's true, we have to special-case debug intrinsics here to prevent
1538     // differences in inlining due to debug symbols. Eventually, the number of
1539     // unsimplified instructions shouldn't factor into the cost computation,
1540     // but until then, hack around it here.
1541     if (isa<DbgInfoIntrinsic>(I))
1542       continue;
1543
1544     // Skip ephemeral values.
1545     if (EphValues.count(&*I))
1546       continue;
1547
1548     ++NumInstructions;
1549     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
1550       ++NumVectorInstructions;
1551
1552     // If the instruction simplified to a constant, there is no cost to this
1553     // instruction. Visit the instructions using our InstVisitor to account for
1554     // all of the per-instruction logic. The visit tree returns true if we
1555     // consumed the instruction in any way, and false if the instruction's base
1556     // cost should count against inlining.
1557     if (Base::visit(&*I))
1558       ++NumInstructionsSimplified;
1559     else
1560       Cost += InlineConstants::InstrCost;
1561
1562     using namespace ore;
1563     // If visiting this instruction detected an uninlinable pattern, abort.
1564     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1565         HasIndirectBr || HasFrameEscape) {
1566       if (ORE)
1567         ORE->emit([&]() {
1568           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1569                                           CandidateCS.getInstruction())
1570                  << NV("Callee", &F)
1571                  << " has uninlinable pattern and cost is not fully computed";
1572         });
1573       return false;
1574     }
1575
1576     // If the caller is a recursive function then we don't want to inline
1577     // functions which allocate a lot of stack space because it would increase
1578     // the caller stack usage dramatically.
1579     if (IsCallerRecursive &&
1580         AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1581       if (ORE)
1582         ORE->emit([&]() {
1583           return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1584                                           CandidateCS.getInstruction())
1585                  << NV("Callee", &F)
1586                  << " is recursive and allocates too much stack space. Cost is "
1587                     "not fully computed";
1588         });
1589       return false;
1590     }
1591
1592     // Check if we've passed the maximum possible threshold so we don't spin in
1593     // huge basic blocks that will never inline.
1594     if (Cost >= Threshold && !ComputeFullInlineCost)
1595       return false;
1596   }
1597
1598   return true;
1599 }
1600
1601 /// \brief Compute the base pointer and cumulative constant offsets for V.
1602 ///
1603 /// This strips all constant offsets off of V, leaving it the base pointer, and
1604 /// accumulates the total constant offset applied in the returned constant. It
1605 /// returns null if V is not a pointer, and returns the constant '0' if there
1606 /// are no constant offsets applied.
1607 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1608   if (!V->getType()->isPointerTy())
1609     return nullptr;
1610
1611   unsigned IntPtrWidth = DL.getPointerSizeInBits();
1612   APInt Offset = APInt::getNullValue(IntPtrWidth);
1613
1614   // Even though we don't look through PHI nodes, we could be called on an
1615   // instruction in an unreachable block, which may be on a cycle.
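  // (Illustrative note, not from the original source: in unreachable code a
  // value may transitively feed itself, e.g. a GEP whose pointer operand is
  // itself, so the Visited set below is what guarantees this walk terminates.)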
1616   SmallPtrSet<Value *, 4> Visited;
1617   Visited.insert(V);
1618   do {
1619     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
1620       if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
1621         return nullptr;
1622       V = GEP->getPointerOperand();
1623     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
1624       V = cast<Operator>(V)->getOperand(0);
1625     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
1626       if (GA->isInterposable())
1627         break;
1628       V = GA->getAliasee();
1629     } else {
1630       break;
1631     }
1632     assert(V->getType()->isPointerTy() && "Unexpected operand type!");
1633   } while (Visited.insert(V).second);
1634
1635   Type *IntPtrTy = DL.getIntPtrType(V->getContext());
1636   return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
1637 }
1638
1639 /// \brief Find dead blocks due to deleted CFG edges during inlining.
1640 ///
1641 /// If we know the successor of the current block, \p CurrBB, has to be \p
1642 /// NextBB, the other successors of \p CurrBB are dead if these successors have
1643 /// no live incoming CFG edges. If one block is found to be dead, we can
1644 /// continue growing the dead block list by checking the successors of the dead
1645 /// blocks to see if all their incoming edges are dead or not.
1646 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
1647   auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
1648     // A CFG edge is dead if the predecessor is dead or the predecessor has a
1649     // known successor which is not the one under examination.
1650     return (DeadBlocks.count(Pred) ||
1651             (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
1652   };
1653
1654   auto IsNewlyDead = [&](BasicBlock *BB) {
1655     // If all the edges to a block are dead, the block is also dead.
1656     return (!DeadBlocks.count(BB) &&
1657             llvm::all_of(predecessors(BB),
1658                          [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
1659   };
1660
1661   for (BasicBlock *Succ : successors(CurrBB)) {
1662     if (Succ == NextBB || !IsNewlyDead(Succ))
1663       continue;
1664     SmallVector<BasicBlock *, 4> NewDead;
1665     NewDead.push_back(Succ);
1666     while (!NewDead.empty()) {
1667       BasicBlock *Dead = NewDead.pop_back_val();
1668       if (DeadBlocks.insert(Dead))
1669         // Continue growing the dead block list.
1670         for (BasicBlock *S : successors(Dead))
1671           if (IsNewlyDead(S))
1672             NewDead.push_back(S);
1673     }
1674   }
1675 }
1676
1677 /// \brief Analyze a call site for potential inlining.
1678 ///
1679 /// Returns true if inlining this call is viable, and false if it is not
1680 /// viable. It computes the cost and adjusts the threshold based on numerous
1681 /// factors and heuristics. If this method returns false but the computed cost
1682 /// is below the computed threshold, then inlining was forcibly disabled by
1683 /// some artifact of the routine.
1684 bool CallAnalyzer::analyzeCall(CallSite CS) {
1685   ++NumCallsAnalyzed;
1686
1687   // Perform some tweaks to the cost and threshold based on the direct
1688   // callsite information.
1689
1690   // We want to more aggressively inline vector-dense kernels, so up the
1691   // threshold, and we'll lower it if the % of vector instructions gets too
1692   // low. Note that these bonuses are somewhat arbitrary and evolved over time
1693   // by accident as much as because they are principled bonuses.
1694   //
1695   // FIXME: It would be nice to remove all such bonuses. At least it would be
1696   // nice to base the bonus values on something more scientific.
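  // (Illustrative example with made-up numbers, not from the original source:
  // if Threshold is the default 225 and the speculative bonuses below add,
  // say, 100, analysis may continue until Cost reaches 325; should the callee
  // then turn out to have multiple blocks or few vector instructions, the
  // corresponding bonus is taken back before the final Cost vs. Threshold
  // comparison.)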
1697   assert(NumInstructions == 0);
1698   assert(NumVectorInstructions == 0);
1699
1700   // Update the threshold based on callsite properties.
1701   updateThreshold(CS, F);
1702
1703   // Speculatively apply all possible bonuses to Threshold. Since Cost never
1704   // decreases, once it exceeds this Threshold at any point we can stop
1705   // processing the rest of the function body.
1706   Threshold += (SingleBBBonus + VectorBonus);
1707
1708   // Give out bonuses for the callsite, as the instructions setting up the
1709   // call will be gone after inlining.
1710   Cost -= getCallsiteCost(CS, DL);
1711
1712   // If this function uses the coldcc calling convention, prefer not to inline
1713   // it.
1714   if (F.getCallingConv() == CallingConv::Cold)
1715     Cost += InlineConstants::ColdccPenalty;
1716
1717   // Check if we're done. This can happen due to bonuses and penalties.
1718   if (Cost >= Threshold && !ComputeFullInlineCost)
1719     return false;
1720
1721   if (F.empty())
1722     return true;
1723
1724   Function *Caller = CS.getInstruction()->getFunction();
1725   // Check if the caller function is recursive itself.
1726   for (User *U : Caller->users()) {
1727     CallSite Site(U);
1728     if (!Site)
1729       continue;
1730     Instruction *I = Site.getInstruction();
1731     if (I->getFunction() == Caller) {
1732       IsCallerRecursive = true;
1733       break;
1734     }
1735   }
1736
1737   // Populate our simplified values by mapping from function arguments to call
1738   // arguments with known important simplifications.
1739   CallSite::arg_iterator CAI = CS.arg_begin();
1740   for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
1741        FAI != FAE; ++FAI, ++CAI) {
1742     assert(CAI != CS.arg_end());
1743     if (Constant *C = dyn_cast<Constant>(CAI))
1744       SimplifiedValues[&*FAI] = C;
1745
1746     Value *PtrArg = *CAI;
1747     if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
1748       ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());
1749
1750       // We can SROA any pointer arguments derived from alloca instructions.
1751       if (isa<AllocaInst>(PtrArg)) {
1752         SROAArgValues[&*FAI] = PtrArg;
1753         SROAArgCosts[PtrArg] = 0;
1754       }
1755     }
1756   }
1757   NumConstantArgs = SimplifiedValues.size();
1758   NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
1759   NumAllocaArgs = SROAArgValues.size();
1760
1761   // FIXME: If a caller has multiple calls to a callee, we end up recomputing
1762   // the ephemeral values multiple times (and they're completely determined by
1763   // the callee, so this is purely duplicate work).
1764   SmallPtrSet<const Value *, 32> EphValues;
1765   CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);
1766
1767   // The worklist of live basic blocks in the callee *after* inlining. We avoid
1768   // adding basic blocks of the callee which can be proven to be dead for this
1769   // particular call site in order to get more accurate cost estimates. This
1770   // requires a somewhat heavyweight iteration pattern: we need to walk the
1771   // basic blocks in a breadth-first order as we insert live successors. To
1772   // accomplish this, we use a small-size-optimized SetVector, since we expect
1773   // to exit after crossing our threshold and so to perform few iterations.
1774   typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1775                     SmallPtrSet<BasicBlock *, 16>>
1776       BBSetVector;
1777   BBSetVector BBWorklist;
1778   BBWorklist.insert(&F.getEntryBlock());
1779   bool SingleBB = true;
1780   // Note that we *must not* cache the size, as this loop grows the worklist.
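  // (Illustrative note, not from the original source: successors discovered
  // while analyzing BBWorklist[Idx] are appended to BBWorklist, so the
  // Idx != BBWorklist.size() test below must re-read the size each iteration
  // to traverse the live CFG breadth-first.)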
1781   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1782     // Bail out the moment we cross the threshold. This means we'll under-count
1783     // the cost, but only when undercounting doesn't matter.
1784     if (Cost >= Threshold && !ComputeFullInlineCost)
1785       break;
1786
1787     BasicBlock *BB = BBWorklist[Idx];
1788     if (BB->empty())
1789       continue;
1790
1791     // Disallow inlining a blockaddress. A blockaddress only has defined
1792     // behavior for an indirect branch in the same function, and we do not
1793     // currently support inlining indirect branches. But, the inliner may not
1794     // see an indirect branch that ends up being dead code at a particular call
1795     // site. If the blockaddress escapes the function, e.g., via a global
1796     // variable, inlining may lead to an invalid cross-function reference.
1797     if (BB->hasAddressTaken())
1798       return false;
1799
1800     // Analyze the cost of this block. If we blow through the threshold, this
1801     // returns false, and we can bail out.
1802     if (!analyzeBlock(BB, EphValues))
1803       return false;
1804
1805     TerminatorInst *TI = BB->getTerminator();
1806
1807     // Add in the live successors by first checking whether we have a terminator
1808     // that may be simplified based on the values simplified by this call.
1809     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
1810       if (BI->isConditional()) {
1811         Value *Cond = BI->getCondition();
1812         if (ConstantInt *SimpleCond =
1813                 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1814           BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
1815           BBWorklist.insert(NextBB);
1816           KnownSuccessors[BB] = NextBB;
1817           findDeadBlocks(BB, NextBB);
1818           continue;
1819         }
1820       }
1821     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1822       Value *Cond = SI->getCondition();
1823       if (ConstantInt *SimpleCond =
1824               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1825         BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
1826         BBWorklist.insert(NextBB);
1827         KnownSuccessors[BB] = NextBB;
1828         findDeadBlocks(BB, NextBB);
1829         continue;
1830       }
1831     }
1832
1833     // If we're unable to select a particular successor, just count all of
1834     // them.
1835     for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
1836          ++TIdx)
1837       BBWorklist.insert(TI->getSuccessor(TIdx));
1838
1839     // If we had any successors at this point, then post-inlining is likely to
1840     // have them as well. Note that we assume any basic blocks which existed
1841     // due to branches or switches which folded above will also fold after
1842     // inlining.
1843     if (SingleBB && TI->getNumSuccessors() > 1) {
1844       // Take off the bonus we applied to the threshold.
1845       Threshold -= SingleBBBonus;
1846       SingleBB = false;
1847     }
1848   }
1849
1850   bool OnlyOneCallAndLocalLinkage =
1851       F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1852   // If this is a noduplicate call, we can still inline as long as
1853   // inlining this would cause the removal of the caller (so the instruction
1854   // is not actually duplicated, just moved).
1855   if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1856     return false;
1857
1858   // We applied the maximum possible vector bonus at the beginning. Now,
1859   // subtract the excess bonus, if any, from the Threshold before
1860   // comparing against Cost.
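  // (Illustrative example, not from the original source: with 100 analyzed
  // instructions, 5 vector instructions (<= 10%) forfeits the whole
  // VectorBonus, 30 (<= 50%) forfeits half of it, and more than 50 keeps the
  // full bonus.)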
1861   if (NumVectorInstructions <= NumInstructions / 10)
1862     Threshold -= VectorBonus;
1863   else if (NumVectorInstructions <= NumInstructions / 2)
1864     Threshold -= VectorBonus / 2;
1865
1866   return Cost < std::max(1, Threshold);
1867 }
1868
1869 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1870 /// \brief Dump stats about this call's analysis.
1871 LLVM_DUMP_METHOD void CallAnalyzer::dump() {
1872 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
1873   DEBUG_PRINT_STAT(NumConstantArgs);
1874   DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
1875   DEBUG_PRINT_STAT(NumAllocaArgs);
1876   DEBUG_PRINT_STAT(NumConstantPtrCmps);
1877   DEBUG_PRINT_STAT(NumConstantPtrDiffs);
1878   DEBUG_PRINT_STAT(NumInstructionsSimplified);
1879   DEBUG_PRINT_STAT(NumInstructions);
1880   DEBUG_PRINT_STAT(SROACostSavings);
1881   DEBUG_PRINT_STAT(SROACostSavingsLost);
1882   DEBUG_PRINT_STAT(LoadEliminationCost);
1883   DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
1884   DEBUG_PRINT_STAT(Cost);
1885   DEBUG_PRINT_STAT(Threshold);
1886 #undef DEBUG_PRINT_STAT
1887 }
1888 #endif
1889
1890 /// \brief Test that there are no attribute conflicts between Caller and Callee
1891 /// that prevent inlining.
1892 static bool functionsHaveCompatibleAttributes(Function *Caller,
1893                                               Function *Callee,
1894                                               TargetTransformInfo &TTI) {
1895   return TTI.areInlineCompatible(Caller, Callee) &&
1896          AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1897 }
1898
1899 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1900   int Cost = 0;
1901   for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1902     if (CS.isByValArgument(I)) {
1903       // We approximate the number of loads and stores needed by dividing the
1904       // size of the byval type by the target's pointer size.
1905       PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1906       unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1907       unsigned PointerSize = DL.getPointerSizeInBits();
1908       // Ceiling division.
1909       unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1910
1911       // If it generates more than 8 stores, it is likely to be expanded as an
1912       // inline memcpy, so we take that as an upper bound. Otherwise we assume
1913       // one load and one store per word copied.
1914       // FIXME: The maxStoresPerMemcpy setting from the target should be used
1915       // here instead of a magic number of 8, but it's not available via
1916       // DataLayout.
1917       NumStores = std::min(NumStores, 8U);
1918
1919       Cost += 2 * NumStores * InlineConstants::InstrCost;
1920     } else {
1921       // For non-byval arguments, count one instruction per call argument (the
1922       // argument setup disappears after inlining).
1923       Cost += InlineConstants::InstrCost;
1924     }
1925   }
1926   // The call instruction also disappears after inlining.
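  // (Illustrative arithmetic, not from the original source: on a 64-bit
  // target, a byval struct of 200 bits needs ceil(200 / 64) = 4 word copies,
  // i.e. 4 loads and 4 stores, for 8 * InstrCost. analyzeCall subtracts the
  // total returned here from Cost, crediting work that inlining removes.)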
1927   Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1928   return Cost;
1929 }
1930
1931 InlineCost llvm::getInlineCost(
1932     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1933     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1934     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1935     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1936   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1937                        GetAssumptionCache, GetBFI, PSI, ORE);
1938 }
1939
1940 InlineCost llvm::getInlineCost(
1941     CallSite CS, Function *Callee, const InlineParams &Params,
1942     TargetTransformInfo &CalleeTTI,
1943     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1944     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1945     ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1946
1947   // Cannot inline indirect calls.
1948   if (!Callee)
1949     return llvm::InlineCost::getNever();
1950
1951   // Calls to functions with always-inline attributes should be inlined
1952   // whenever possible.
1953   if (CS.hasFnAttr(Attribute::AlwaysInline)) {
1954     if (isInlineViable(*Callee))
1955       return llvm::InlineCost::getAlways();
1956     return llvm::InlineCost::getNever();
1957   }
1958
1959   // Never inline functions with conflicting attributes (unless the callee has
1960   // the always-inline attribute).
1961   Function *Caller = CS.getCaller();
1962   if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
1963     return llvm::InlineCost::getNever();
1964
1965   // Don't inline this call if the caller has the optnone attribute.
1966   if (Caller->hasFnAttribute(Attribute::OptimizeNone))
1967     return llvm::InlineCost::getNever();
1968
1969   // Don't inline functions which can be interposed at link-time. Don't inline
1970   // functions marked noinline or call sites marked noinline.
1971   // Note: inlining non-exact non-interposable functions is fine, since we know
1972   // we have *a* correct implementation of the source level function.
1973   if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
1974       CS.isNoInline())
1975     return llvm::InlineCost::getNever();
1976
1977   DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
1978                      << "... (caller:" << Caller->getName() << ")\n");
1979
1980   CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
1981                   Params);
1982   bool ShouldInline = CA.analyzeCall(CS);
1983
1984   DEBUG(CA.dump());
1985
1986   // Check if there was a reason to force inlining or to forbid it.
1987   if (!ShouldInline && CA.getCost() < CA.getThreshold())
1988     return InlineCost::getNever();
1989   if (ShouldInline && CA.getCost() >= CA.getThreshold())
1990     return InlineCost::getAlways();
1991
1992   return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
1993 }
1994
1995 bool llvm::isInlineViable(Function &F) {
1996   bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
1997   for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
1998     // Disallow inlining of functions which contain indirect branches or
1999     // blockaddresses.
2000     if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
2001       return false;
2002
2003     for (auto &II : *BI) {
2004       CallSite CS(&II);
2005       if (!CS)
2006         continue;
2007
2008       // Disallow recursive calls.
2009       if (&F == CS.getCalledFunction())
2010         return false;
2011
2012       // Disallow calls which expose returns-twice to a function not previously
2013       // attributed as such.
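      // (Illustrative note, not from the original source: a callee that calls
      // setjmp is the classic returns-twice case; inlining it into a caller
      // not attributed returns_twice would hide that property from later
      // passes.)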
2014       if (!ReturnsTwice && CS.isCall() &&
2015           cast<CallInst>(CS.getInstruction())->canReturnTwice())
2016         return false;
2017
2018       // Disallow inlining functions that call @llvm.localescape. Doing this
2019       // correctly would require major changes to the inliner.
2020       if (CS.getCalledFunction() &&
2021           CS.getCalledFunction()->getIntrinsicID() ==
2022               llvm::Intrinsic::localescape)
2023         return false;
2024     }
2025   }
2026
2027   return true;
2028 }
2029
2030 // APIs to create InlineParams based on command line flags and/or other
2031 // parameters.
2032
2033 InlineParams llvm::getInlineParams(int Threshold) {
2034   InlineParams Params;
2035
2036   // This field is the threshold to use for a callee by default. This is
2037   // derived from one or more of:
2038   //  * optimization or size-optimization levels,
2039   //  * a value passed to the createFunctionInliningPass function, or
2040   //  * the -inline-threshold flag.
2041   // If the -inline-threshold flag is explicitly specified, that is used
2042   // irrespective of anything else.
2043   if (InlineThreshold.getNumOccurrences() > 0)
2044     Params.DefaultThreshold = InlineThreshold;
2045   else
2046     Params.DefaultThreshold = Threshold;
2047
2048   // Set the HintThreshold knob from the -inlinehint-threshold.
2049   Params.HintThreshold = HintThreshold;
2050
2051   // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
2052   Params.HotCallSiteThreshold = HotCallSiteThreshold;
2053
2054   // If the -locally-hot-callsite-threshold is explicitly specified, use it to
2055   // populate LocallyHotCallSiteThreshold. Later, we populate
2056   // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
2057   // we know that optimization level is O3 (in the getInlineParams variant that
2058   // takes the opt and size levels).
2059   // FIXME: Remove this check (and make the assignment unconditional) after
2060   // addressing size regression issues at O2.
2061   if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
2062     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2063
2064   // Set the ColdCallSiteThreshold knob from -inline-cold-callsite-threshold.
2065   Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
2066
2067   // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
2068   // -inline-threshold commandline option is not explicitly given. If that
2069   // option is present, then its value applies even for callees with size and
2070   // minsize attributes.
2071   // If the -inline-threshold is not specified, set the ColdThreshold from the
2072   // -inlinecold-threshold even if it is not explicitly passed. If
2073   // -inline-threshold is specified, then -inlinecold-threshold needs to be
2074   // explicitly specified to set the ColdThreshold knob.
2075   if (InlineThreshold.getNumOccurrences() == 0) {
2076     Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
2077     Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
2078     Params.ColdThreshold = ColdThreshold;
2079   } else if (ColdThreshold.getNumOccurrences() > 0) {
2080     Params.ColdThreshold = ColdThreshold;
2081   }
2082   return Params;
2083 }
2084
2085 InlineParams llvm::getInlineParams() {
2086   return getInlineParams(InlineThreshold);
2087 }
2088
2089 // Compute the default threshold for inlining based on the opt level and the
2090 // size opt level.
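// (Illustrative mapping, matching the code below: OptLevel > 2 corresponds to
// -O3 and selects OptAggressiveThreshold; SizeOptLevel 1 is -Os and 2 is -Oz.)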
2091 static int computeThresholdFromOptLevels(unsigned OptLevel,
2092                                          unsigned SizeOptLevel) {
2093   if (OptLevel > 2)
2094     return InlineConstants::OptAggressiveThreshold;
2095   if (SizeOptLevel == 1) // -Os
2096     return InlineConstants::OptSizeThreshold;
2097   if (SizeOptLevel == 2) // -Oz
2098     return InlineConstants::OptMinSizeThreshold;
2099   return InlineThreshold;
2100 }
2101
2102 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
2103   auto Params =
2104       getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
2105   // At O3, use the value of the -locally-hot-callsite-threshold option to
2106   // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has an
2107   // effect only when it is specified explicitly.
2108   if (OptLevel > 2)
2109     Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
2110   return Params;
2111 }
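// Illustrative usage sketch (a hypothetical caller, not part of this file):
//
//   InlineParams Params = getInlineParams(/*OptLevel=*/3, /*SizeOptLevel=*/0);
//   InlineCost IC = getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache,
//                                 GetBFI, PSI, &ORE);
//   if (IC)
//     ... perform the inlining transformation on CS ...
//
// InlineCost converts to true when the computed cost is below the threshold
// (and for the "always" special case), so a check like the one above is what
// typically gates the actual inlining.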