1 //===- InlineCost.cpp - Cost analysis for inliner -------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements inline cost analysis. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Analysis/InlineCost.h" 15 #include "llvm/ADT/STLExtras.h" 16 #include "llvm/ADT/SetVector.h" 17 #include "llvm/ADT/SmallPtrSet.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/Statistic.h" 20 #include "llvm/Analysis/AssumptionCache.h" 21 #include "llvm/Analysis/BlockFrequencyInfo.h" 22 #include "llvm/Analysis/CodeMetrics.h" 23 #include "llvm/Analysis/ConstantFolding.h" 24 #include "llvm/Analysis/CFG.h" 25 #include "llvm/Analysis/InstructionSimplify.h" 26 #include "llvm/Analysis/ProfileSummaryInfo.h" 27 #include "llvm/Analysis/TargetTransformInfo.h" 28 #include "llvm/Analysis/ValueTracking.h" 29 #include "llvm/IR/CallSite.h" 30 #include "llvm/IR/CallingConv.h" 31 #include "llvm/IR/DataLayout.h" 32 #include "llvm/IR/GetElementPtrTypeIterator.h" 33 #include "llvm/IR/GlobalAlias.h" 34 #include "llvm/IR/InstVisitor.h" 35 #include "llvm/IR/IntrinsicInst.h" 36 #include "llvm/IR/Operator.h" 37 #include "llvm/Support/Debug.h" 38 #include "llvm/Support/raw_ostream.h" 39 40 using namespace llvm; 41 42 #define DEBUG_TYPE "inline-cost" 43 44 STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed"); 45 46 static cl::opt<int> InlineThreshold( 47 "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore, 48 cl::desc("Control the amount of inlining to perform (default = 225)")); 49 50 static cl::opt<int> HintThreshold( 51 "inlinehint-threshold", cl::Hidden, cl::init(325), 52 cl::desc("Threshold for inlining functions with inline hint")); 53 54 static cl::opt<int> 55 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden, 56 cl::init(45), 57 cl::desc("Threshold for inlining cold callsites")); 58 59 // We introduce this threshold to help performance of instrumentation based 60 // PGO before we actually hook up inliner with analysis passes such as BPI and 61 // BFI. 62 static cl::opt<int> ColdThreshold( 63 "inlinecold-threshold", cl::Hidden, cl::init(45), 64 cl::desc("Threshold for inlining functions with cold attribute")); 65 66 static cl::opt<int> 67 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000), 68 cl::ZeroOrMore, 69 cl::desc("Threshold for hot callsites ")); 70 71 static cl::opt<int> LocallyHotCallSiteThreshold( 72 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore, 73 cl::desc("Threshold for locally hot callsites ")); 74 75 static cl::opt<int> ColdCallSiteRelFreq( 76 "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore, 77 cl::desc("Maxmimum block frequency, expressed as a percentage of caller's " 78 "entry frequency, for a callsite to be cold in the absence of " 79 "profile information.")); 80 81 static cl::opt<int> HotCallSiteRelFreq( 82 "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore, 83 cl::desc("Minimum block frequency, expressed as a multiple of caller's " 84 "entry frequency, for a callsite to be hot in the absence of " 85 "profile information.")); 86 87 static cl::opt<bool> OptComputeFullInlineCost( 88 "inline-cost-full", cl::Hidden, cl::init(false), 89 cl::desc("Compute the full inline cost of a call site even when the cost " 90 "exceeds the threshold.")); 91 92 namespace { 93 94 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { 95 typedef InstVisitor<CallAnalyzer, bool> Base; 96 friend class InstVisitor<CallAnalyzer, bool>; 97 98 /// The TargetTransformInfo available for this compilation. 99 const TargetTransformInfo &TTI; 100 101 /// Getter for the cache of @llvm.assume intrinsics. 102 std::function<AssumptionCache &(Function &)> &GetAssumptionCache; 103 104 /// Getter for BlockFrequencyInfo 105 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI; 106 107 /// Profile summary information. 108 ProfileSummaryInfo *PSI; 109 110 /// The called function. 111 Function &F; 112 113 // Cache the DataLayout since we use it a lot. 114 const DataLayout &DL; 115 116 /// The OptimizationRemarkEmitter available for this compilation. 117 OptimizationRemarkEmitter *ORE; 118 119 /// The candidate callsite being analyzed. Please do not use this to do 120 /// analysis in the caller function; we want the inline cost query to be 121 /// easily cacheable. Instead, use the cover function paramHasAttr. 122 CallSite CandidateCS; 123 124 /// Tunable parameters that control the analysis. 125 const InlineParams &Params; 126 127 int Threshold; 128 int Cost; 129 bool ComputeFullInlineCost; 130 131 bool IsCallerRecursive; 132 bool IsRecursiveCall; 133 bool ExposesReturnsTwice; 134 bool HasDynamicAlloca; 135 bool ContainsNoDuplicateCall; 136 bool HasReturn; 137 bool HasIndirectBr; 138 bool HasFrameEscape; 139 140 /// Number of bytes allocated statically by the callee. 141 uint64_t AllocatedSize; 142 unsigned NumInstructions, NumVectorInstructions; 143 int VectorBonus, TenPercentVectorBonus; 144 // Bonus to be applied when the callee has only one reachable basic block. 145 int SingleBBBonus; 146 147 /// While we walk the potentially-inlined instructions, we build up and 148 /// maintain a mapping of simplified values specific to this callsite. The 149 /// idea is to propagate any special information we have about arguments to 150 /// this call through the inlinable section of the function, and account for 151 /// likely simplifications post-inlining. The most important aspect we track 152 /// is CFG altering simplifications -- when we prove a basic block dead, that 153 /// can cause dramatic shifts in the cost of inlining a function. 154 DenseMap<Value *, Constant *> SimplifiedValues; 155 156 /// Keep track of the values which map back (through function arguments) to 157 /// allocas on the caller stack which could be simplified through SROA. 158 DenseMap<Value *, Value *> SROAArgValues; 159 160 /// The mapping of caller Alloca values to their accumulated cost savings. If 161 /// we have to disable SROA for one of the allocas, this tells us how much 162 /// cost must be added. 163 DenseMap<Value *, int> SROAArgCosts; 164 165 /// Keep track of values which map to a pointer base and constant offset. 166 DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs; 167 168 /// Keep track of dead blocks due to the constant arguments. 169 SetVector<BasicBlock *> DeadBlocks; 170 171 /// The mapping of the blocks to their known unique successors due to the 172 /// constant arguments. 173 DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors; 174 175 /// Model the elimination of repeated loads that is expected to happen 176 /// whenever we simplify away the stores that would otherwise cause them to be 177 /// loads. 178 bool EnableLoadElimination; 179 SmallPtrSet<Value *, 16> LoadAddrSet; 180 int LoadEliminationCost; 181 182 // Custom simplification helper routines. 183 bool isAllocaDerivedArg(Value *V); 184 bool lookupSROAArgAndCost(Value *V, Value *&Arg, 185 DenseMap<Value *, int>::iterator &CostIt); 186 void disableSROA(DenseMap<Value *, int>::iterator CostIt); 187 void disableSROA(Value *V); 188 void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); 189 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt, 190 int InstructionCost); 191 void disableLoadElimination(); 192 bool isGEPFree(GetElementPtrInst &GEP); 193 bool canFoldInboundsGEP(GetElementPtrInst &I); 194 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); 195 bool simplifyCallSite(Function *F, CallSite CS); 196 template <typename Callable> 197 bool simplifyInstruction(Instruction &I, Callable Evaluate); 198 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); 199 200 /// Return true if the given argument to the function being considered for 201 /// inlining has the given attribute set either at the call site or the 202 /// function declaration. Primarily used to inspect call site specific 203 /// attributes since these can be more precise than the ones on the callee 204 /// itself. 205 bool paramHasAttr(Argument *A, Attribute::AttrKind Attr); 206 207 /// Return true if the given value is known non null within the callee if 208 /// inlined through this particular callsite. 209 bool isKnownNonNullInCallee(Value *V); 210 211 /// Update Threshold based on callsite properties such as callee 212 /// attributes and callee hotness for PGO builds. The Callee is explicitly 213 /// passed to support analyzing indirect calls whose target is inferred by 214 /// analysis. 215 void updateThreshold(CallSite CS, Function &Callee); 216 217 /// Return true if size growth is allowed when inlining the callee at CS. 218 bool allowSizeGrowth(CallSite CS); 219 220 /// Return true if \p CS is a cold callsite. 221 bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI); 222 223 /// Return a higher threshold if \p CS is a hot callsite. 224 Optional<int> getHotCallSiteThreshold(CallSite CS, 225 BlockFrequencyInfo *CallerBFI); 226 227 // Custom analysis routines. 228 bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues); 229 230 // Disable several entry points to the visitor so we don't accidentally use 231 // them by declaring but not defining them here. 232 void visit(Module *); 233 void visit(Module &); 234 void visit(Function *); 235 void visit(Function &); 236 void visit(BasicBlock *); 237 void visit(BasicBlock &); 238 239 // Provide base case for our instruction visit. 240 bool visitInstruction(Instruction &I); 241 242 // Our visit overrides. 243 bool visitAlloca(AllocaInst &I); 244 bool visitPHI(PHINode &I); 245 bool visitGetElementPtr(GetElementPtrInst &I); 246 bool visitBitCast(BitCastInst &I); 247 bool visitPtrToInt(PtrToIntInst &I); 248 bool visitIntToPtr(IntToPtrInst &I); 249 bool visitCastInst(CastInst &I); 250 bool visitUnaryInstruction(UnaryInstruction &I); 251 bool visitCmpInst(CmpInst &I); 252 bool visitSub(BinaryOperator &I); 253 bool visitBinaryOperator(BinaryOperator &I); 254 bool visitLoad(LoadInst &I); 255 bool visitStore(StoreInst &I); 256 bool visitExtractValue(ExtractValueInst &I); 257 bool visitInsertValue(InsertValueInst &I); 258 bool visitCallSite(CallSite CS); 259 bool visitReturnInst(ReturnInst &RI); 260 bool visitBranchInst(BranchInst &BI); 261 bool visitSelectInst(SelectInst &SI); 262 bool visitSwitchInst(SwitchInst &SI); 263 bool visitIndirectBrInst(IndirectBrInst &IBI); 264 bool visitResumeInst(ResumeInst &RI); 265 bool visitCleanupReturnInst(CleanupReturnInst &RI); 266 bool visitCatchReturnInst(CatchReturnInst &RI); 267 bool visitUnreachableInst(UnreachableInst &I); 268 269 public: 270 CallAnalyzer(const TargetTransformInfo &TTI, 271 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 272 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI, 273 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, 274 Function &Callee, CallSite CSArg, const InlineParams &Params) 275 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), 276 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), 277 CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), 278 Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost || 279 Params.ComputeFullInlineCost || ORE), 280 IsCallerRecursive(false), IsRecursiveCall(false), 281 ExposesReturnsTwice(false), HasDynamicAlloca(false), 282 ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), 283 HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), 284 NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0), 285 EnableLoadElimination(true), LoadEliminationCost(0), NumConstantArgs(0), 286 NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), 287 NumConstantPtrDiffs(0), NumInstructionsSimplified(0), 288 SROACostSavings(0), SROACostSavingsLost(0) {} 289 290 bool analyzeCall(CallSite CS); 291 292 int getThreshold() { return Threshold; } 293 int getCost() { return Cost; } 294 295 // Keep a bunch of stats about the cost savings found so we can print them 296 // out when debugging. 297 unsigned NumConstantArgs; 298 unsigned NumConstantOffsetPtrArgs; 299 unsigned NumAllocaArgs; 300 unsigned NumConstantPtrCmps; 301 unsigned NumConstantPtrDiffs; 302 unsigned NumInstructionsSimplified; 303 unsigned SROACostSavings; 304 unsigned SROACostSavingsLost; 305 306 void dump(); 307 }; 308 309 } // namespace 310 311 /// \brief Test whether the given value is an Alloca-derived function argument. 312 bool CallAnalyzer::isAllocaDerivedArg(Value *V) { 313 return SROAArgValues.count(V); 314 } 315 316 /// \brief Lookup the SROA-candidate argument and cost iterator which V maps to. 317 /// Returns false if V does not map to a SROA-candidate. 318 bool CallAnalyzer::lookupSROAArgAndCost( 319 Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) { 320 if (SROAArgValues.empty() || SROAArgCosts.empty()) 321 return false; 322 323 DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V); 324 if (ArgIt == SROAArgValues.end()) 325 return false; 326 327 Arg = ArgIt->second; 328 CostIt = SROAArgCosts.find(Arg); 329 return CostIt != SROAArgCosts.end(); 330 } 331 332 /// \brief Disable SROA for the candidate marked by this cost iterator. 333 /// 334 /// This marks the candidate as no longer viable for SROA, and adds the cost 335 /// savings associated with it back into the inline cost measurement. 336 void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) { 337 // If we're no longer able to perform SROA we need to undo its cost savings 338 // and prevent subsequent analysis. 339 Cost += CostIt->second; 340 SROACostSavings -= CostIt->second; 341 SROACostSavingsLost += CostIt->second; 342 SROAArgCosts.erase(CostIt); 343 disableLoadElimination(); 344 } 345 346 /// \brief If 'V' maps to a SROA candidate, disable SROA for it. 347 void CallAnalyzer::disableSROA(Value *V) { 348 Value *SROAArg; 349 DenseMap<Value *, int>::iterator CostIt; 350 if (lookupSROAArgAndCost(V, SROAArg, CostIt)) 351 disableSROA(CostIt); 352 } 353 354 /// \brief Accumulate the given cost for a particular SROA candidate. 355 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt, 356 int InstructionCost) { 357 CostIt->second += InstructionCost; 358 SROACostSavings += InstructionCost; 359 } 360 361 void CallAnalyzer::disableLoadElimination() { 362 if (EnableLoadElimination) { 363 Cost += LoadEliminationCost; 364 LoadEliminationCost = 0; 365 EnableLoadElimination = false; 366 } 367 } 368 369 /// \brief Accumulate a constant GEP offset into an APInt if possible. 370 /// 371 /// Returns false if unable to compute the offset for any reason. Respects any 372 /// simplified values known during the analysis of this callsite. 373 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { 374 unsigned IntPtrWidth = DL.getPointerTypeSizeInBits(GEP.getType()); 375 assert(IntPtrWidth == Offset.getBitWidth()); 376 377 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 378 GTI != GTE; ++GTI) { 379 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 380 if (!OpC) 381 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand())) 382 OpC = dyn_cast<ConstantInt>(SimpleOp); 383 if (!OpC) 384 return false; 385 if (OpC->isZero()) 386 continue; 387 388 // Handle a struct index, which adds its field offset to the pointer. 389 if (StructType *STy = GTI.getStructTypeOrNull()) { 390 unsigned ElementIdx = OpC->getZExtValue(); 391 const StructLayout *SL = DL.getStructLayout(STy); 392 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); 393 continue; 394 } 395 396 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); 397 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; 398 } 399 return true; 400 } 401 402 /// \brief Use TTI to check whether a GEP is free. 403 /// 404 /// Respects any simplified values known during the analysis of this callsite. 405 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { 406 SmallVector<Value *, 4> Operands; 407 Operands.push_back(GEP.getOperand(0)); 408 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) 409 if (Constant *SimpleOp = SimplifiedValues.lookup(*I)) 410 Operands.push_back(SimpleOp); 411 else 412 Operands.push_back(*I); 413 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands); 414 } 415 416 bool CallAnalyzer::visitAlloca(AllocaInst &I) { 417 // Check whether inlining will turn a dynamic alloca into a static 418 // alloca and handle that case. 419 if (I.isArrayAllocation()) { 420 Constant *Size = SimplifiedValues.lookup(I.getArraySize()); 421 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) { 422 Type *Ty = I.getAllocatedType(); 423 AllocatedSize = SaturatingMultiplyAdd( 424 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize); 425 return Base::visitAlloca(I); 426 } 427 } 428 429 // Accumulate the allocated size. 430 if (I.isStaticAlloca()) { 431 Type *Ty = I.getAllocatedType(); 432 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize); 433 } 434 435 // We will happily inline static alloca instructions. 436 if (I.isStaticAlloca()) 437 return Base::visitAlloca(I); 438 439 // FIXME: This is overly conservative. Dynamic allocas are inefficient for 440 // a variety of reasons, and so we would like to not inline them into 441 // functions which don't currently have a dynamic alloca. This simply 442 // disables inlining altogether in the presence of a dynamic alloca. 443 HasDynamicAlloca = true; 444 return false; 445 } 446 447 bool CallAnalyzer::visitPHI(PHINode &I) { 448 // FIXME: We need to propagate SROA *disabling* through phi nodes, even 449 // though we don't want to propagate it's bonuses. The idea is to disable 450 // SROA if it *might* be used in an inappropriate manner. 451 452 // Phi nodes are always zero-cost. 453 // FIXME: Pointer sizes may differ between different address spaces, so do we 454 // need to use correct address space in the call to getPointerSizeInBits here? 455 // Or could we skip the getPointerSizeInBits call completely? As far as I can 456 // see the ZeroOffset is used as a dummy value, so we can probably use any 457 // bit width for the ZeroOffset? 458 APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0)); 459 bool CheckSROA = I.getType()->isPointerTy(); 460 461 // Track the constant or pointer with constant offset we've seen so far. 462 Constant *FirstC = nullptr; 463 std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset}; 464 Value *FirstV = nullptr; 465 466 for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) { 467 BasicBlock *Pred = I.getIncomingBlock(i); 468 // If the incoming block is dead, skip the incoming block. 469 if (DeadBlocks.count(Pred)) 470 continue; 471 // If the parent block of phi is not the known successor of the incoming 472 // block, skip the incoming block. 473 BasicBlock *KnownSuccessor = KnownSuccessors[Pred]; 474 if (KnownSuccessor && KnownSuccessor != I.getParent()) 475 continue; 476 477 Value *V = I.getIncomingValue(i); 478 // If the incoming value is this phi itself, skip the incoming value. 479 if (&I == V) 480 continue; 481 482 Constant *C = dyn_cast<Constant>(V); 483 if (!C) 484 C = SimplifiedValues.lookup(V); 485 486 std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset}; 487 if (!C && CheckSROA) 488 BaseAndOffset = ConstantOffsetPtrs.lookup(V); 489 490 if (!C && !BaseAndOffset.first) 491 // The incoming value is neither a constant nor a pointer with constant 492 // offset, exit early. 493 return true; 494 495 if (FirstC) { 496 if (FirstC == C) 497 // If we've seen a constant incoming value before and it is the same 498 // constant we see this time, continue checking the next incoming value. 499 continue; 500 // Otherwise early exit because we either see a different constant or saw 501 // a constant before but we have a pointer with constant offset this time. 502 return true; 503 } 504 505 if (FirstV) { 506 // The same logic as above, but check pointer with constant offset here. 507 if (FirstBaseAndOffset == BaseAndOffset) 508 continue; 509 return true; 510 } 511 512 if (C) { 513 // This is the 1st time we've seen a constant, record it. 514 FirstC = C; 515 continue; 516 } 517 518 // The remaining case is that this is the 1st time we've seen a pointer with 519 // constant offset, record it. 520 FirstV = V; 521 FirstBaseAndOffset = BaseAndOffset; 522 } 523 524 // Check if we can map phi to a constant. 525 if (FirstC) { 526 SimplifiedValues[&I] = FirstC; 527 return true; 528 } 529 530 // Check if we can map phi to a pointer with constant offset. 531 if (FirstBaseAndOffset.first) { 532 ConstantOffsetPtrs[&I] = FirstBaseAndOffset; 533 534 Value *SROAArg; 535 DenseMap<Value *, int>::iterator CostIt; 536 if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt)) 537 SROAArgValues[&I] = SROAArg; 538 } 539 540 return true; 541 } 542 543 /// \brief Check we can fold GEPs of constant-offset call site argument pointers. 544 /// This requires target data and inbounds GEPs. 545 /// 546 /// \return true if the specified GEP can be folded. 547 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { 548 // Check if we have a base + offset for the pointer. 549 std::pair<Value *, APInt> BaseAndOffset = 550 ConstantOffsetPtrs.lookup(I.getPointerOperand()); 551 if (!BaseAndOffset.first) 552 return false; 553 554 // Check if the offset of this GEP is constant, and if so accumulate it 555 // into Offset. 556 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) 557 return false; 558 559 // Add the result as a new mapping to Base + Offset. 560 ConstantOffsetPtrs[&I] = BaseAndOffset; 561 562 return true; 563 } 564 565 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { 566 Value *SROAArg; 567 DenseMap<Value *, int>::iterator CostIt; 568 bool SROACandidate = 569 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); 570 571 // Lambda to check whether a GEP's indices are all constant. 572 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { 573 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) 574 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I)) 575 return false; 576 return true; 577 }; 578 579 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { 580 if (SROACandidate) 581 SROAArgValues[&I] = SROAArg; 582 583 // Constant GEPs are modeled as free. 584 return true; 585 } 586 587 // Variable GEPs will require math and will disable SROA. 588 if (SROACandidate) 589 disableSROA(CostIt); 590 return isGEPFree(I); 591 } 592 593 /// Simplify \p I if its operands are constants and update SimplifiedValues. 594 /// \p Evaluate is a callable specific to instruction type that evaluates the 595 /// instruction when all the operands are constants. 596 template <typename Callable> 597 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) { 598 SmallVector<Constant *, 2> COps; 599 for (Value *Op : I.operands()) { 600 Constant *COp = dyn_cast<Constant>(Op); 601 if (!COp) 602 COp = SimplifiedValues.lookup(Op); 603 if (!COp) 604 return false; 605 COps.push_back(COp); 606 } 607 auto *C = Evaluate(COps); 608 if (!C) 609 return false; 610 SimplifiedValues[&I] = C; 611 return true; 612 } 613 614 bool CallAnalyzer::visitBitCast(BitCastInst &I) { 615 // Propagate constants through bitcasts. 616 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 617 return ConstantExpr::getBitCast(COps[0], I.getType()); 618 })) 619 return true; 620 621 // Track base/offsets through casts 622 std::pair<Value *, APInt> BaseAndOffset = 623 ConstantOffsetPtrs.lookup(I.getOperand(0)); 624 // Casts don't change the offset, just wrap it up. 625 if (BaseAndOffset.first) 626 ConstantOffsetPtrs[&I] = BaseAndOffset; 627 628 // Also look for SROA candidates here. 629 Value *SROAArg; 630 DenseMap<Value *, int>::iterator CostIt; 631 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) 632 SROAArgValues[&I] = SROAArg; 633 634 // Bitcasts are always zero cost. 635 return true; 636 } 637 638 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { 639 // Propagate constants through ptrtoint. 640 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 641 return ConstantExpr::getPtrToInt(COps[0], I.getType()); 642 })) 643 return true; 644 645 // Track base/offset pairs when converted to a plain integer provided the 646 // integer is large enough to represent the pointer. 647 unsigned IntegerSize = I.getType()->getScalarSizeInBits(); 648 unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace(); 649 if (IntegerSize >= DL.getPointerSizeInBits(AS)) { 650 std::pair<Value *, APInt> BaseAndOffset = 651 ConstantOffsetPtrs.lookup(I.getOperand(0)); 652 if (BaseAndOffset.first) 653 ConstantOffsetPtrs[&I] = BaseAndOffset; 654 } 655 656 // This is really weird. Technically, ptrtoint will disable SROA. However, 657 // unless that ptrtoint is *used* somewhere in the live basic blocks after 658 // inlining, it will be nuked, and SROA should proceed. All of the uses which 659 // would block SROA would also block SROA if applied directly to a pointer, 660 // and so we can just add the integer in here. The only places where SROA is 661 // preserved either cannot fire on an integer, or won't in-and-of themselves 662 // disable SROA (ext) w/o some later use that we would see and disable. 663 Value *SROAArg; 664 DenseMap<Value *, int>::iterator CostIt; 665 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) 666 SROAArgValues[&I] = SROAArg; 667 668 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 669 } 670 671 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { 672 // Propagate constants through ptrtoint. 673 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 674 return ConstantExpr::getIntToPtr(COps[0], I.getType()); 675 })) 676 return true; 677 678 // Track base/offset pairs when round-tripped through a pointer without 679 // modifications provided the integer is not too large. 680 Value *Op = I.getOperand(0); 681 unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); 682 if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) { 683 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op); 684 if (BaseAndOffset.first) 685 ConstantOffsetPtrs[&I] = BaseAndOffset; 686 } 687 688 // "Propagate" SROA here in the same manner as we do for ptrtoint above. 689 Value *SROAArg; 690 DenseMap<Value *, int>::iterator CostIt; 691 if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) 692 SROAArgValues[&I] = SROAArg; 693 694 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 695 } 696 697 bool CallAnalyzer::visitCastInst(CastInst &I) { 698 // Propagate constants through ptrtoint. 699 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 700 return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType()); 701 })) 702 return true; 703 704 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. 705 disableSROA(I.getOperand(0)); 706 707 // If this is a floating-point cast, and the target says this operation 708 // is expensive, this may eventually become a library call. Treat the cost 709 // as such. 710 switch (I.getOpcode()) { 711 case Instruction::FPTrunc: 712 case Instruction::FPExt: 713 case Instruction::UIToFP: 714 case Instruction::SIToFP: 715 case Instruction::FPToUI: 716 case Instruction::FPToSI: 717 if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) 718 Cost += InlineConstants::CallPenalty; 719 default: 720 break; 721 } 722 723 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 724 } 725 726 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { 727 Value *Operand = I.getOperand(0); 728 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 729 return ConstantFoldInstOperands(&I, COps[0], DL); 730 })) 731 return true; 732 733 // Disable any SROA on the argument to arbitrary unary operators. 734 disableSROA(Operand); 735 736 return false; 737 } 738 739 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { 740 return CandidateCS.paramHasAttr(A->getArgNo(), Attr); 741 } 742 743 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { 744 // Does the *call site* have the NonNull attribute set on an argument? We 745 // use the attribute on the call site to memoize any analysis done in the 746 // caller. This will also trip if the callee function has a non-null 747 // parameter attribute, but that's a less interesting case because hopefully 748 // the callee would already have been simplified based on that. 749 if (Argument *A = dyn_cast<Argument>(V)) 750 if (paramHasAttr(A, Attribute::NonNull)) 751 return true; 752 753 // Is this an alloca in the caller? This is distinct from the attribute case 754 // above because attributes aren't updated within the inliner itself and we 755 // always want to catch the alloca derived case. 756 if (isAllocaDerivedArg(V)) 757 // We can actually predict the result of comparisons between an 758 // alloca-derived value and null. Note that this fires regardless of 759 // SROA firing. 760 return true; 761 762 return false; 763 } 764 765 bool CallAnalyzer::allowSizeGrowth(CallSite CS) { 766 // If the normal destination of the invoke or the parent block of the call 767 // site is unreachable-terminated, there is little point in inlining this 768 // unless there is literally zero cost. 769 // FIXME: Note that it is possible that an unreachable-terminated block has a 770 // hot entry. For example, in below scenario inlining hot_call_X() may be 771 // beneficial : 772 // main() { 773 // hot_call_1(); 774 // ... 775 // hot_call_N() 776 // exit(0); 777 // } 778 // For now, we are not handling this corner case here as it is rare in real 779 // code. In future, we should elaborate this based on BPI and BFI in more 780 // general threshold adjusting heuristics in updateThreshold(). 781 Instruction *Instr = CS.getInstruction(); 782 if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) { 783 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator())) 784 return false; 785 } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator())) 786 return false; 787 788 return true; 789 } 790 791 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) { 792 // If global profile summary is available, then callsite's coldness is 793 // determined based on that. 794 if (PSI && PSI->hasProfileSummary()) 795 return PSI->isColdCallSite(CS, CallerBFI); 796 797 // Otherwise we need BFI to be available. 798 if (!CallerBFI) 799 return false; 800 801 // Determine if the callsite is cold relative to caller's entry. We could 802 // potentially cache the computation of scaled entry frequency, but the added 803 // complexity is not worth it unless this scaling shows up high in the 804 // profiles. 805 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); 806 auto CallSiteBB = CS.getInstruction()->getParent(); 807 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB); 808 auto CallerEntryFreq = 809 CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock())); 810 return CallSiteFreq < CallerEntryFreq * ColdProb; 811 } 812 813 Optional<int> 814 CallAnalyzer::getHotCallSiteThreshold(CallSite CS, 815 BlockFrequencyInfo *CallerBFI) { 816 817 // If global profile summary is available, then callsite's hotness is 818 // determined based on that. 819 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI)) 820 return Params.HotCallSiteThreshold; 821 822 // Otherwise we need BFI to be available and to have a locally hot callsite 823 // threshold. 824 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) 825 return None; 826 827 // Determine if the callsite is hot relative to caller's entry. We could 828 // potentially cache the computation of scaled entry frequency, but the added 829 // complexity is not worth it unless this scaling shows up high in the 830 // profiles. 831 auto CallSiteBB = CS.getInstruction()->getParent(); 832 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency(); 833 auto CallerEntryFreq = CallerBFI->getEntryFreq(); 834 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq) 835 return Params.LocallyHotCallSiteThreshold; 836 837 // Otherwise treat it normally. 838 return None; 839 } 840 841 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { 842 // If no size growth is allowed for this inlining, set Threshold to 0. 843 if (!allowSizeGrowth(CS)) { 844 Threshold = 0; 845 return; 846 } 847 848 Function *Caller = CS.getCaller(); 849 850 // return min(A, B) if B is valid. 851 auto MinIfValid = [](int A, Optional<int> B) { 852 return B ? std::min(A, B.getValue()) : A; 853 }; 854 855 // return max(A, B) if B is valid. 856 auto MaxIfValid = [](int A, Optional<int> B) { 857 return B ? std::max(A, B.getValue()) : A; 858 }; 859 860 // Various bonus percentages. These are multiplied by Threshold to get the 861 // bonus values. 862 // SingleBBBonus: This bonus is applied if the callee has a single reachable 863 // basic block at the given callsite context. This is speculatively applied 864 // and withdrawn if more than one basic block is seen. 865 // 866 // Vector bonuses: We want to more aggressively inline vector-dense kernels 867 // and apply this bonus based on the percentage of vector instructions. A 868 // bonus is applied if the vector instructions exceed 50% and half that amount 869 // is applied if it exceeds 10%. Note that these bonuses are some what 870 // arbitrary and evolved over time by accident as much as because they are 871 // principled bonuses. 872 // FIXME: It would be nice to base the bonus values on something more 873 // scientific. 874 // 875 // LstCallToStaticBonus: This large bonus is applied to ensure the inlining 876 // of the last call to a static function as inlining such functions is 877 // guaranteed to reduce code size. 878 // 879 // These bonus percentages may be set to 0 based on properties of the caller 880 // and the callsite. 881 int SingleBBBonusPercent = 50; 882 int VectorBonusPercent = 150; 883 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus; 884 885 // Lambda to set all the above bonus and bonus percentages to 0. 886 auto DisallowAllBonuses = [&]() { 887 SingleBBBonusPercent = 0; 888 VectorBonusPercent = 0; 889 LastCallToStaticBonus = 0; 890 }; 891 892 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available 893 // and reduce the threshold if the caller has the necessary attribute. 894 if (Caller->optForMinSize()) { 895 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); 896 // For minsize, we want to disable the single BB bonus and the vector 897 // bonuses, but not the last-call-to-static bonus. Inlining the last call to 898 // a static function will, at the minimum, eliminate the parameter setup and 899 // call/return instructions. 900 SingleBBBonusPercent = 0; 901 VectorBonusPercent = 0; 902 } else if (Caller->optForSize()) 903 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); 904 905 // Adjust the threshold based on inlinehint attribute and profile based 906 // hotness information if the caller does not have MinSize attribute. 907 if (!Caller->optForMinSize()) { 908 if (Callee.hasFnAttribute(Attribute::InlineHint)) 909 Threshold = MaxIfValid(Threshold, Params.HintThreshold); 910 911 // FIXME: After switching to the new passmanager, simplify the logic below 912 // by checking only the callsite hotness/coldness as we will reliably 913 // have local profile information. 914 // 915 // Callsite hotness and coldness can be determined if sample profile is 916 // used (which adds hotness metadata to calls) or if caller's 917 // BlockFrequencyInfo is available. 918 BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr; 919 auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI); 920 if (!Caller->optForSize() && HotCallSiteThreshold) { 921 DEBUG(dbgs() << "Hot callsite.\n"); 922 // FIXME: This should update the threshold only if it exceeds the 923 // current threshold, but AutoFDO + ThinLTO currently relies on this 924 // behavior to prevent inlining of hot callsites during ThinLTO 925 // compile phase. 926 Threshold = HotCallSiteThreshold.getValue(); 927 } else if (isColdCallSite(CS, CallerBFI)) { 928 DEBUG(dbgs() << "Cold callsite.\n"); 929 // Do not apply bonuses for a cold callsite including the 930 // LastCallToStatic bonus. While this bonus might result in code size 931 // reduction, it can cause the size of a non-cold caller to increase 932 // preventing it from being inlined. 933 DisallowAllBonuses(); 934 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); 935 } else if (PSI) { 936 // Use callee's global profile information only if we have no way of 937 // determining this via callsite information. 938 if (PSI->isFunctionEntryHot(&Callee)) { 939 DEBUG(dbgs() << "Hot callee.\n"); 940 // If callsite hotness can not be determined, we may still know 941 // that the callee is hot and treat it as a weaker hint for threshold 942 // increase. 943 Threshold = MaxIfValid(Threshold, Params.HintThreshold); 944 } else if (PSI->isFunctionEntryCold(&Callee)) { 945 DEBUG(dbgs() << "Cold callee.\n"); 946 // Do not apply bonuses for a cold callee including the 947 // LastCallToStatic bonus. While this bonus might result in code size 948 // reduction, it can cause the size of a non-cold caller to increase 949 // preventing it from being inlined. 950 DisallowAllBonuses(); 951 Threshold = MinIfValid(Threshold, Params.ColdThreshold); 952 } 953 } 954 } 955 956 // Finally, take the target-specific inlining threshold multiplier into 957 // account. 958 Threshold *= TTI.getInliningThresholdMultiplier(); 959 960 SingleBBBonus = Threshold * SingleBBBonusPercent / 100; 961 VectorBonus = Threshold * VectorBonusPercent / 100; 962 963 bool OnlyOneCallAndLocalLinkage = 964 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); 965 // If there is only one call of the function, and it has internal linkage, 966 // the cost of inlining it drops dramatically. It may seem odd to update 967 // Cost in updateThreshold, but the bonus depends on the logic in this method. 968 if (OnlyOneCallAndLocalLinkage) 969 Cost -= LastCallToStaticBonus; 970 } 971 972 bool CallAnalyzer::visitCmpInst(CmpInst &I) { 973 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 974 // First try to handle simplified comparisons. 975 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 976 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]); 977 })) 978 return true; 979 980 if (I.getOpcode() == Instruction::FCmp) 981 return false; 982 983 // Otherwise look for a comparison between constant offset pointers with 984 // a common base. 985 Value *LHSBase, *RHSBase; 986 APInt LHSOffset, RHSOffset; 987 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 988 if (LHSBase) { 989 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 990 if (RHSBase && LHSBase == RHSBase) { 991 // We have common bases, fold the icmp to a constant based on the 992 // offsets. 993 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 994 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 995 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) { 996 SimplifiedValues[&I] = C; 997 ++NumConstantPtrCmps; 998 return true; 999 } 1000 } 1001 } 1002 1003 // If the comparison is an equality comparison with null, we can simplify it 1004 // if we know the value (argument) can't be null 1005 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) && 1006 isKnownNonNullInCallee(I.getOperand(0))) { 1007 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; 1008 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType()) 1009 : ConstantInt::getFalse(I.getType()); 1010 return true; 1011 } 1012 // Finally check for SROA candidates in comparisons. 1013 Value *SROAArg; 1014 DenseMap<Value *, int>::iterator CostIt; 1015 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) { 1016 if (isa<ConstantPointerNull>(I.getOperand(1))) { 1017 accumulateSROACost(CostIt, InlineConstants::InstrCost); 1018 return true; 1019 } 1020 1021 disableSROA(CostIt); 1022 } 1023 1024 return false; 1025 } 1026 1027 bool CallAnalyzer::visitSub(BinaryOperator &I) { 1028 // Try to handle a special case: we can fold computing the difference of two 1029 // constant-related pointers. 1030 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 1031 Value *LHSBase, *RHSBase; 1032 APInt LHSOffset, RHSOffset; 1033 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 1034 if (LHSBase) { 1035 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 1036 if (RHSBase && LHSBase == RHSBase) { 1037 // We have common bases, fold the subtract to a constant based on the 1038 // offsets. 1039 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 1040 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 1041 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) { 1042 SimplifiedValues[&I] = C; 1043 ++NumConstantPtrDiffs; 1044 return true; 1045 } 1046 } 1047 } 1048 1049 // Otherwise, fall back to the generic logic for simplifying and handling 1050 // instructions. 1051 return Base::visitSub(I); 1052 } 1053 1054 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { 1055 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 1056 Constant *CLHS = dyn_cast<Constant>(LHS); 1057 if (!CLHS) 1058 CLHS = SimplifiedValues.lookup(LHS); 1059 Constant *CRHS = dyn_cast<Constant>(RHS); 1060 if (!CRHS) 1061 CRHS = SimplifiedValues.lookup(RHS); 1062 1063 Value *SimpleV = nullptr; 1064 if (auto FI = dyn_cast<FPMathOperator>(&I)) 1065 SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS, 1066 CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL); 1067 else 1068 SimpleV = 1069 SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL); 1070 1071 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) 1072 SimplifiedValues[&I] = C; 1073 1074 if (SimpleV) 1075 return true; 1076 1077 // Disable any SROA on arguments to arbitrary, unsimplified binary operators. 1078 disableSROA(LHS); 1079 disableSROA(RHS); 1080 1081 // If the instruction is floating point, and the target says this operation 1082 // is expensive, this may eventually become a library call. Treat the cost 1083 // as such. 1084 if (I.getType()->isFloatingPointTy() && 1085 TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) 1086 Cost += InlineConstants::CallPenalty; 1087 1088 return false; 1089 } 1090 1091 bool CallAnalyzer::visitLoad(LoadInst &I) { 1092 Value *SROAArg; 1093 DenseMap<Value *, int>::iterator CostIt; 1094 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { 1095 if (I.isSimple()) { 1096 accumulateSROACost(CostIt, InlineConstants::InstrCost); 1097 return true; 1098 } 1099 1100 disableSROA(CostIt); 1101 } 1102 1103 // If the data is already loaded from this address and hasn't been clobbered 1104 // by any stores or calls, this load is likely to be redundant and can be 1105 // eliminated. 1106 if (EnableLoadElimination && 1107 !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) { 1108 LoadEliminationCost += InlineConstants::InstrCost; 1109 return true; 1110 } 1111 1112 return false; 1113 } 1114 1115 bool CallAnalyzer::visitStore(StoreInst &I) { 1116 Value *SROAArg; 1117 DenseMap<Value *, int>::iterator CostIt; 1118 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { 1119 if (I.isSimple()) { 1120 accumulateSROACost(CostIt, InlineConstants::InstrCost); 1121 return true; 1122 } 1123 1124 disableSROA(CostIt); 1125 } 1126 1127 // The store can potentially clobber loads and prevent repeated loads from 1128 // being eliminated. 1129 // FIXME: 1130 // 1. We can probably keep an initial set of eliminatable loads substracted 1131 // from the cost even when we finally see a store. We just need to disable 1132 // *further* accumulation of elimination savings. 1133 // 2. We should probably at some point thread MemorySSA for the callee into 1134 // this and then use that to actually compute *really* precise savings. 1135 disableLoadElimination(); 1136 return false; 1137 } 1138 1139 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { 1140 // Constant folding for extract value is trivial. 1141 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 1142 return ConstantExpr::getExtractValue(COps[0], I.getIndices()); 1143 })) 1144 return true; 1145 1146 // SROA can look through these but give them a cost. 1147 return false; 1148 } 1149 1150 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { 1151 // Constant folding for insert value is trivial. 1152 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { 1153 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0], 1154 /*InsertedValueOperand*/ COps[1], 1155 I.getIndices()); 1156 })) 1157 return true; 1158 1159 // SROA can look through these but give them a cost. 1160 return false; 1161 } 1162 1163 /// \brief Try to simplify a call site. 1164 /// 1165 /// Takes a concrete function and callsite and tries to actually simplify it by 1166 /// analyzing the arguments and call itself with instsimplify. Returns true if 1167 /// it has simplified the callsite to some other entity (a constant), making it 1168 /// free. 1169 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) { 1170 // FIXME: Using the instsimplify logic directly for this is inefficient 1171 // because we have to continually rebuild the argument list even when no 1172 // simplifications can be performed. Until that is fixed with remapping 1173 // inside of instsimplify, directly constant fold calls here. 1174 if (!canConstantFoldCallTo(CS, F)) 1175 return false; 1176 1177 // Try to re-map the arguments to constants. 1178 SmallVector<Constant *, 4> ConstantArgs; 1179 ConstantArgs.reserve(CS.arg_size()); 1180 for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E; 1181 ++I) { 1182 Constant *C = dyn_cast<Constant>(*I); 1183 if (!C) 1184 C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I)); 1185 if (!C) 1186 return false; // This argument doesn't map to a constant. 1187 1188 ConstantArgs.push_back(C); 1189 } 1190 if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) { 1191 SimplifiedValues[CS.getInstruction()] = C; 1192 return true; 1193 } 1194 1195 return false; 1196 } 1197 1198 bool CallAnalyzer::visitCallSite(CallSite CS) { 1199 if (CS.hasFnAttr(Attribute::ReturnsTwice) && 1200 !F.hasFnAttribute(Attribute::ReturnsTwice)) { 1201 // This aborts the entire analysis. 1202 ExposesReturnsTwice = true; 1203 return false; 1204 } 1205 if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate()) 1206 ContainsNoDuplicateCall = true; 1207 1208 if (Function *F = CS.getCalledFunction()) { 1209 // When we have a concrete function, first try to simplify it directly. 1210 if (simplifyCallSite(F, CS)) 1211 return true; 1212 1213 // Next check if it is an intrinsic we know about. 1214 // FIXME: Lift this into part of the InstVisitor. 1215 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) { 1216 switch (II->getIntrinsicID()) { 1217 default: 1218 if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II)) 1219 disableLoadElimination(); 1220 return Base::visitCallSite(CS); 1221 1222 case Intrinsic::load_relative: 1223 // This is normally lowered to 4 LLVM instructions. 1224 Cost += 3 * InlineConstants::InstrCost; 1225 return false; 1226 1227 case Intrinsic::memset: 1228 case Intrinsic::memcpy: 1229 case Intrinsic::memmove: 1230 disableLoadElimination(); 1231 // SROA can usually chew through these intrinsics, but they aren't free. 1232 return false; 1233 case Intrinsic::localescape: 1234 HasFrameEscape = true; 1235 return false; 1236 } 1237 } 1238 1239 if (F == CS.getInstruction()->getFunction()) { 1240 // This flag will fully abort the analysis, so don't bother with anything 1241 // else. 1242 IsRecursiveCall = true; 1243 return false; 1244 } 1245 1246 if (TTI.isLoweredToCall(F)) { 1247 // We account for the average 1 instruction per call argument setup 1248 // here. 1249 Cost += CS.arg_size() * InlineConstants::InstrCost; 1250 1251 // Everything other than inline ASM will also have a significant cost 1252 // merely from making the call. 1253 if (!isa<InlineAsm>(CS.getCalledValue())) 1254 Cost += InlineConstants::CallPenalty; 1255 } 1256 1257 if (!CS.onlyReadsMemory()) 1258 disableLoadElimination(); 1259 return Base::visitCallSite(CS); 1260 } 1261 1262 // Otherwise we're in a very special case -- an indirect function call. See 1263 // if we can be particularly clever about this. 1264 Value *Callee = CS.getCalledValue(); 1265 1266 // First, pay the price of the argument setup. We account for the average 1267 // 1 instruction per call argument setup here. 1268 Cost += CS.arg_size() * InlineConstants::InstrCost; 1269 1270 // Next, check if this happens to be an indirect function call to a known 1271 // function in this inline context. If not, we've done all we can. 1272 Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee)); 1273 if (!F) { 1274 if (!CS.onlyReadsMemory()) 1275 disableLoadElimination(); 1276 return Base::visitCallSite(CS); 1277 } 1278 1279 // If we have a constant that we are calling as a function, we can peer 1280 // through it and see the function target. This happens not infrequently 1281 // during devirtualization and so we want to give it a hefty bonus for 1282 // inlining, but cap that bonus in the event that inlining wouldn't pan 1283 // out. Pretend to inline the function, with a custom threshold. 1284 auto IndirectCallParams = Params; 1285 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; 1286 CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS, 1287 IndirectCallParams); 1288 if (CA.analyzeCall(CS)) { 1289 // We were able to inline the indirect call! Subtract the cost from the 1290 // threshold to get the bonus we want to apply, but don't go below zero. 1291 Cost -= std::max(0, CA.getThreshold() - CA.getCost()); 1292 } 1293 1294 if (!F->onlyReadsMemory()) 1295 disableLoadElimination(); 1296 return Base::visitCallSite(CS); 1297 } 1298 1299 bool CallAnalyzer::visitReturnInst(ReturnInst &RI) { 1300 // At least one return instruction will be free after inlining. 1301 bool Free = !HasReturn; 1302 HasReturn = true; 1303 return Free; 1304 } 1305 1306 bool CallAnalyzer::visitBranchInst(BranchInst &BI) { 1307 // We model unconditional branches as essentially free -- they really 1308 // shouldn't exist at all, but handling them makes the behavior of the 1309 // inliner more regular and predictable. Interestingly, conditional branches 1310 // which will fold away are also free. 1311 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) || 1312 dyn_cast_or_null<ConstantInt>( 1313 SimplifiedValues.lookup(BI.getCondition())); 1314 } 1315 1316 bool CallAnalyzer::visitSelectInst(SelectInst &SI) { 1317 bool CheckSROA = SI.getType()->isPointerTy(); 1318 Value *TrueVal = SI.getTrueValue(); 1319 Value *FalseVal = SI.getFalseValue(); 1320 1321 Constant *TrueC = dyn_cast<Constant>(TrueVal); 1322 if (!TrueC) 1323 TrueC = SimplifiedValues.lookup(TrueVal); 1324 Constant *FalseC = dyn_cast<Constant>(FalseVal); 1325 if (!FalseC) 1326 FalseC = SimplifiedValues.lookup(FalseVal); 1327 Constant *CondC = 1328 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition())); 1329 1330 if (!CondC) { 1331 // Select C, X, X => X 1332 if (TrueC == FalseC && TrueC) { 1333 SimplifiedValues[&SI] = TrueC; 1334 return true; 1335 } 1336 1337 if (!CheckSROA) 1338 return Base::visitSelectInst(SI); 1339 1340 std::pair<Value *, APInt> TrueBaseAndOffset = 1341 ConstantOffsetPtrs.lookup(TrueVal); 1342 std::pair<Value *, APInt> FalseBaseAndOffset = 1343 ConstantOffsetPtrs.lookup(FalseVal); 1344 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { 1345 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; 1346 1347 Value *SROAArg; 1348 DenseMap<Value *, int>::iterator CostIt; 1349 if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt)) 1350 SROAArgValues[&SI] = SROAArg; 1351 return true; 1352 } 1353 1354 return Base::visitSelectInst(SI); 1355 } 1356 1357 // Select condition is a constant. 1358 Value *SelectedV = CondC->isAllOnesValue() 1359 ? TrueVal 1360 : (CondC->isNullValue()) ? FalseVal : nullptr; 1361 if (!SelectedV) { 1362 // Condition is a vector constant that is not all 1s or all 0s. If all 1363 // operands are constants, ConstantExpr::getSelect() can handle the cases 1364 // such as select vectors. 1365 if (TrueC && FalseC) { 1366 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) { 1367 SimplifiedValues[&SI] = C; 1368 return true; 1369 } 1370 } 1371 return Base::visitSelectInst(SI); 1372 } 1373 1374 // Condition is either all 1s or all 0s. SI can be simplified. 1375 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) { 1376 SimplifiedValues[&SI] = SelectedC; 1377 return true; 1378 } 1379 1380 if (!CheckSROA) 1381 return true; 1382 1383 std::pair<Value *, APInt> BaseAndOffset = 1384 ConstantOffsetPtrs.lookup(SelectedV); 1385 if (BaseAndOffset.first) { 1386 ConstantOffsetPtrs[&SI] = BaseAndOffset; 1387 1388 Value *SROAArg; 1389 DenseMap<Value *, int>::iterator CostIt; 1390 if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt)) 1391 SROAArgValues[&SI] = SROAArg; 1392 } 1393 1394 return true; 1395 } 1396 1397 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { 1398 // We model unconditional switches as free, see the comments on handling 1399 // branches. 1400 if (isa<ConstantInt>(SI.getCondition())) 1401 return true; 1402 if (Value *V = SimplifiedValues.lookup(SI.getCondition())) 1403 if (isa<ConstantInt>(V)) 1404 return true; 1405 1406 // Assume the most general case where the switch is lowered into 1407 // either a jump table, bit test, or a balanced binary tree consisting of 1408 // case clusters without merging adjacent clusters with the same 1409 // destination. We do not consider the switches that are lowered with a mix 1410 // of jump table/bit test/binary search tree. The cost of the switch is 1411 // proportional to the size of the tree or the size of jump table range. 1412 // 1413 // NB: We convert large switches which are just used to initialize large phi 1414 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent 1415 // inlining those. It will prevent inlining in cases where the optimization 1416 // does not (yet) fire. 1417 1418 // Maximum valid cost increased in this function. 1419 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; 1420 1421 // Exit early for a large switch, assuming one case needs at least one 1422 // instruction. 1423 // FIXME: This is not true for a bit test, but ignore such case for now to 1424 // save compile-time. 1425 int64_t CostLowerBound = 1426 std::min((int64_t)CostUpperBound, 1427 (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); 1428 1429 if (CostLowerBound > Threshold && !ComputeFullInlineCost) { 1430 Cost = CostLowerBound; 1431 return false; 1432 } 1433 1434 unsigned JumpTableSize = 0; 1435 unsigned NumCaseCluster = 1436 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); 1437 1438 // If suitable for a jump table, consider the cost for the table size and 1439 // branch to destination. 1440 if (JumpTableSize) { 1441 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + 1442 4 * InlineConstants::InstrCost; 1443 1444 Cost = std::min((int64_t)CostUpperBound, JTCost + Cost); 1445 return false; 1446 } 1447 1448 // Considering forming a binary search, we should find the number of nodes 1449 // which is same as the number of comparisons when lowered. For a given 1450 // number of clusters, n, we can define a recursive function, f(n), to find 1451 // the number of nodes in the tree. The recursion is : 1452 // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, 1453 // and f(n) = n, when n <= 3. 1454 // This will lead a binary tree where the leaf should be either f(2) or f(3) 1455 // when n > 3. So, the number of comparisons from leaves should be n, while 1456 // the number of non-leaf should be : 1457 // 2^(log2(n) - 1) - 1 1458 // = 2^log2(n) * 2^-1 - 1 1459 // = n / 2 - 1. 1460 // Considering comparisons from leaf and non-leaf nodes, we can estimate the 1461 // number of comparisons in a simple closed form : 1462 // n + n / 2 - 1 = n * 3 / 2 - 1 1463 if (NumCaseCluster <= 3) { 1464 // Suppose a comparison includes one compare and one conditional branch. 1465 Cost += NumCaseCluster * 2 * InlineConstants::InstrCost; 1466 return false; 1467 } 1468 1469 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; 1470 int64_t SwitchCost = 1471 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; 1472 1473 Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost); 1474 return false; 1475 } 1476 1477 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { 1478 // We never want to inline functions that contain an indirectbr. This is 1479 // incorrect because all the blockaddress's (in static global initializers 1480 // for example) would be referring to the original function, and this 1481 // indirect jump would jump from the inlined copy of the function into the 1482 // original function which is extremely undefined behavior. 1483 // FIXME: This logic isn't really right; we can safely inline functions with 1484 // indirectbr's as long as no other function or global references the 1485 // blockaddress of a block within the current function. 1486 HasIndirectBr = true; 1487 return false; 1488 } 1489 1490 bool CallAnalyzer::visitResumeInst(ResumeInst &RI) { 1491 // FIXME: It's not clear that a single instruction is an accurate model for 1492 // the inline cost of a resume instruction. 1493 return false; 1494 } 1495 1496 bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) { 1497 // FIXME: It's not clear that a single instruction is an accurate model for 1498 // the inline cost of a cleanupret instruction. 1499 return false; 1500 } 1501 1502 bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) { 1503 // FIXME: It's not clear that a single instruction is an accurate model for 1504 // the inline cost of a catchret instruction. 1505 return false; 1506 } 1507 1508 bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) { 1509 // FIXME: It might be reasonably to discount the cost of instructions leading 1510 // to unreachable as they have the lowest possible impact on both runtime and 1511 // code size. 1512 return true; // No actual code is needed for unreachable. 1513 } 1514 1515 bool CallAnalyzer::visitInstruction(Instruction &I) { 1516 // Some instructions are free. All of the free intrinsics can also be 1517 // handled by SROA, etc. 1518 if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I)) 1519 return true; 1520 1521 // We found something we don't understand or can't handle. Mark any SROA-able 1522 // values in the operand list as no longer viable. 1523 for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI) 1524 disableSROA(*OI); 1525 1526 return false; 1527 } 1528 1529 /// \brief Analyze a basic block for its contribution to the inline cost. 1530 /// 1531 /// This method walks the analyzer over every instruction in the given basic 1532 /// block and accounts for their cost during inlining at this callsite. It 1533 /// aborts early if the threshold has been exceeded or an impossible to inline 1534 /// construct has been detected. It returns false if inlining is no longer 1535 /// viable, and true if inlining remains viable. 1536 bool CallAnalyzer::analyzeBlock(BasicBlock *BB, 1537 SmallPtrSetImpl<const Value *> &EphValues) { 1538 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 1539 // FIXME: Currently, the number of instructions in a function regardless of 1540 // our ability to simplify them during inline to constants or dead code, 1541 // are actually used by the vector bonus heuristic. As long as that's true, 1542 // we have to special case debug intrinsics here to prevent differences in 1543 // inlining due to debug symbols. Eventually, the number of unsimplified 1544 // instructions shouldn't factor into the cost computation, but until then, 1545 // hack around it here. 1546 if (isa<DbgInfoIntrinsic>(I)) 1547 continue; 1548 1549 // Skip ephemeral values. 1550 if (EphValues.count(&*I)) 1551 continue; 1552 1553 ++NumInstructions; 1554 if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy()) 1555 ++NumVectorInstructions; 1556 1557 // If the instruction simplified to a constant, there is no cost to this 1558 // instruction. Visit the instructions using our InstVisitor to account for 1559 // all of the per-instruction logic. The visit tree returns true if we 1560 // consumed the instruction in any way, and false if the instruction's base 1561 // cost should count against inlining. 1562 if (Base::visit(&*I)) 1563 ++NumInstructionsSimplified; 1564 else 1565 Cost += InlineConstants::InstrCost; 1566 1567 using namespace ore; 1568 // If the visit this instruction detected an uninlinable pattern, abort. 1569 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca || 1570 HasIndirectBr || HasFrameEscape) { 1571 if (ORE) 1572 ORE->emit([&]() { 1573 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", 1574 CandidateCS.getInstruction()) 1575 << NV("Callee", &F) 1576 << " has uninlinable pattern and cost is not fully computed"; 1577 }); 1578 return false; 1579 } 1580 1581 // If the caller is a recursive function then we don't want to inline 1582 // functions which allocate a lot of stack space because it would increase 1583 // the caller stack usage dramatically. 1584 if (IsCallerRecursive && 1585 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) { 1586 if (ORE) 1587 ORE->emit([&]() { 1588 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", 1589 CandidateCS.getInstruction()) 1590 << NV("Callee", &F) 1591 << " is recursive and allocates too much stack space. Cost is " 1592 "not fully computed"; 1593 }); 1594 return false; 1595 } 1596 1597 // Check if we've past the maximum possible threshold so we don't spin in 1598 // huge basic blocks that will never inline. 1599 if (Cost >= Threshold && !ComputeFullInlineCost) 1600 return false; 1601 } 1602 1603 return true; 1604 } 1605 1606 /// \brief Compute the base pointer and cumulative constant offsets for V. 1607 /// 1608 /// This strips all constant offsets off of V, leaving it the base pointer, and 1609 /// accumulates the total constant offset applied in the returned constant. It 1610 /// returns 0 if V is not a pointer, and returns the constant '0' if there are 1611 /// no constant offsets applied. 1612 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { 1613 if (!V->getType()->isPointerTy()) 1614 return nullptr; 1615 1616 unsigned AS = V->getType()->getPointerAddressSpace(); 1617 unsigned IntPtrWidth = DL.getPointerSizeInBits(AS); 1618 APInt Offset = APInt::getNullValue(IntPtrWidth); 1619 1620 // Even though we don't look through PHI nodes, we could be called on an 1621 // instruction in an unreachable block, which may be on a cycle. 1622 SmallPtrSet<Value *, 4> Visited; 1623 Visited.insert(V); 1624 do { 1625 if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { 1626 if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset)) 1627 return nullptr; 1628 V = GEP->getPointerOperand(); 1629 } else if (Operator::getOpcode(V) == Instruction::BitCast) { 1630 V = cast<Operator>(V)->getOperand(0); 1631 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) { 1632 if (GA->isInterposable()) 1633 break; 1634 V = GA->getAliasee(); 1635 } else { 1636 break; 1637 } 1638 assert(V->getType()->isPointerTy() && "Unexpected operand type!"); 1639 } while (Visited.insert(V).second); 1640 1641 Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS); 1642 return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset)); 1643 } 1644 1645 /// \brief Find dead blocks due to deleted CFG edges during inlining. 1646 /// 1647 /// If we know the successor of the current block, \p CurrBB, has to be \p 1648 /// NextBB, the other successors of \p CurrBB are dead if these successors have 1649 /// no live incoming CFG edges. If one block is found to be dead, we can 1650 /// continue growing the dead block list by checking the successors of the dead 1651 /// blocks to see if all their incoming edges are dead or not. 1652 void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { 1653 auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) { 1654 // A CFG edge is dead if the predecessor is dead or the predessor has a 1655 // known successor which is not the one under exam. 1656 return (DeadBlocks.count(Pred) || 1657 (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ)); 1658 }; 1659 1660 auto IsNewlyDead = [&](BasicBlock *BB) { 1661 // If all the edges to a block are dead, the block is also dead. 1662 return (!DeadBlocks.count(BB) && 1663 llvm::all_of(predecessors(BB), 1664 [&](BasicBlock *P) { return IsEdgeDead(P, BB); })); 1665 }; 1666 1667 for (BasicBlock *Succ : successors(CurrBB)) { 1668 if (Succ == NextBB || !IsNewlyDead(Succ)) 1669 continue; 1670 SmallVector<BasicBlock *, 4> NewDead; 1671 NewDead.push_back(Succ); 1672 while (!NewDead.empty()) { 1673 BasicBlock *Dead = NewDead.pop_back_val(); 1674 if (DeadBlocks.insert(Dead)) 1675 // Continue growing the dead block lists. 1676 for (BasicBlock *S : successors(Dead)) 1677 if (IsNewlyDead(S)) 1678 NewDead.push_back(S); 1679 } 1680 } 1681 } 1682 1683 /// \brief Analyze a call site for potential inlining. 1684 /// 1685 /// Returns true if inlining this call is viable, and false if it is not 1686 /// viable. It computes the cost and adjusts the threshold based on numerous 1687 /// factors and heuristics. If this method returns false but the computed cost 1688 /// is below the computed threshold, then inlining was forcibly disabled by 1689 /// some artifact of the routine. 1690 bool CallAnalyzer::analyzeCall(CallSite CS) { 1691 ++NumCallsAnalyzed; 1692 1693 // Perform some tweaks to the cost and threshold based on the direct 1694 // callsite information. 1695 1696 // We want to more aggressively inline vector-dense kernels, so up the 1697 // threshold, and we'll lower it if the % of vector instructions gets too 1698 // low. Note that these bonuses are some what arbitrary and evolved over time 1699 // by accident as much as because they are principled bonuses. 1700 // 1701 // FIXME: It would be nice to remove all such bonuses. At least it would be 1702 // nice to base the bonus values on something more scientific. 1703 assert(NumInstructions == 0); 1704 assert(NumVectorInstructions == 0); 1705 1706 // Update the threshold based on callsite properties 1707 updateThreshold(CS, F); 1708 1709 // Speculatively apply all possible bonuses to Threshold. If cost exceeds 1710 // this Threshold any time, and cost cannot decrease, we can stop processing 1711 // the rest of the function body. 1712 Threshold += (SingleBBBonus + VectorBonus); 1713 1714 // Give out bonuses for the callsite, as the instructions setting them up 1715 // will be gone after inlining. 1716 Cost -= getCallsiteCost(CS, DL); 1717 1718 // If this function uses the coldcc calling convention, prefer not to inline 1719 // it. 1720 if (F.getCallingConv() == CallingConv::Cold) 1721 Cost += InlineConstants::ColdccPenalty; 1722 1723 // Check if we're done. This can happen due to bonuses and penalties. 1724 if (Cost >= Threshold && !ComputeFullInlineCost) 1725 return false; 1726 1727 if (F.empty()) 1728 return true; 1729 1730 Function *Caller = CS.getInstruction()->getFunction(); 1731 // Check if the caller function is recursive itself. 1732 for (User *U : Caller->users()) { 1733 CallSite Site(U); 1734 if (!Site) 1735 continue; 1736 Instruction *I = Site.getInstruction(); 1737 if (I->getFunction() == Caller) { 1738 IsCallerRecursive = true; 1739 break; 1740 } 1741 } 1742 1743 // Populate our simplified values by mapping from function arguments to call 1744 // arguments with known important simplifications. 1745 CallSite::arg_iterator CAI = CS.arg_begin(); 1746 for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end(); 1747 FAI != FAE; ++FAI, ++CAI) { 1748 assert(CAI != CS.arg_end()); 1749 if (Constant *C = dyn_cast<Constant>(CAI)) 1750 SimplifiedValues[&*FAI] = C; 1751 1752 Value *PtrArg = *CAI; 1753 if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) { 1754 ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); 1755 1756 // We can SROA any pointer arguments derived from alloca instructions. 1757 if (isa<AllocaInst>(PtrArg)) { 1758 SROAArgValues[&*FAI] = PtrArg; 1759 SROAArgCosts[PtrArg] = 0; 1760 } 1761 } 1762 } 1763 NumConstantArgs = SimplifiedValues.size(); 1764 NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size(); 1765 NumAllocaArgs = SROAArgValues.size(); 1766 1767 // FIXME: If a caller has multiple calls to a callee, we end up recomputing 1768 // the ephemeral values multiple times (and they're completely determined by 1769 // the callee, so this is purely duplicate work). 1770 SmallPtrSet<const Value *, 32> EphValues; 1771 CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues); 1772 1773 // The worklist of live basic blocks in the callee *after* inlining. We avoid 1774 // adding basic blocks of the callee which can be proven to be dead for this 1775 // particular call site in order to get more accurate cost estimates. This 1776 // requires a somewhat heavyweight iteration pattern: we need to walk the 1777 // basic blocks in a breadth-first order as we insert live successors. To 1778 // accomplish this, prioritizing for small iterations because we exit after 1779 // crossing our threshold, we use a small-size optimized SetVector. 1780 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, 1781 SmallPtrSet<BasicBlock *, 16>> 1782 BBSetVector; 1783 BBSetVector BBWorklist; 1784 BBWorklist.insert(&F.getEntryBlock()); 1785 bool SingleBB = true; 1786 // Note that we *must not* cache the size, this loop grows the worklist. 1787 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { 1788 // Bail out the moment we cross the threshold. This means we'll under-count 1789 // the cost, but only when undercounting doesn't matter. 1790 if (Cost >= Threshold && !ComputeFullInlineCost) 1791 break; 1792 1793 BasicBlock *BB = BBWorklist[Idx]; 1794 if (BB->empty()) 1795 continue; 1796 1797 // Disallow inlining a blockaddress. A blockaddress only has defined 1798 // behavior for an indirect branch in the same function, and we do not 1799 // currently support inlining indirect branches. But, the inliner may not 1800 // see an indirect branch that ends up being dead code at a particular call 1801 // site. If the blockaddress escapes the function, e.g., via a global 1802 // variable, inlining may lead to an invalid cross-function reference. 1803 if (BB->hasAddressTaken()) 1804 return false; 1805 1806 // Analyze the cost of this block. If we blow through the threshold, this 1807 // returns false, and we can bail on out. 1808 if (!analyzeBlock(BB, EphValues)) 1809 return false; 1810 1811 TerminatorInst *TI = BB->getTerminator(); 1812 1813 // Add in the live successors by first checking whether we have terminator 1814 // that may be simplified based on the values simplified by this call. 1815 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) { 1816 if (BI->isConditional()) { 1817 Value *Cond = BI->getCondition(); 1818 if (ConstantInt *SimpleCond = 1819 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 1820 BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0); 1821 BBWorklist.insert(NextBB); 1822 KnownSuccessors[BB] = NextBB; 1823 findDeadBlocks(BB, NextBB); 1824 continue; 1825 } 1826 } 1827 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { 1828 Value *Cond = SI->getCondition(); 1829 if (ConstantInt *SimpleCond = 1830 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 1831 BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor(); 1832 BBWorklist.insert(NextBB); 1833 KnownSuccessors[BB] = NextBB; 1834 findDeadBlocks(BB, NextBB); 1835 continue; 1836 } 1837 } 1838 1839 // If we're unable to select a particular successor, just count all of 1840 // them. 1841 for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; 1842 ++TIdx) 1843 BBWorklist.insert(TI->getSuccessor(TIdx)); 1844 1845 // If we had any successors at this point, than post-inlining is likely to 1846 // have them as well. Note that we assume any basic blocks which existed 1847 // due to branches or switches which folded above will also fold after 1848 // inlining. 1849 if (SingleBB && TI->getNumSuccessors() > 1) { 1850 // Take off the bonus we applied to the threshold. 1851 Threshold -= SingleBBBonus; 1852 SingleBB = false; 1853 } 1854 } 1855 1856 bool OnlyOneCallAndLocalLinkage = 1857 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); 1858 // If this is a noduplicate call, we can still inline as long as 1859 // inlining this would cause the removal of the caller (so the instruction 1860 // is not actually duplicated, just moved). 1861 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) 1862 return false; 1863 1864 // We applied the maximum possible vector bonus at the beginning. Now, 1865 // subtract the excess bonus, if any, from the Threshold before 1866 // comparing against Cost. 1867 if (NumVectorInstructions <= NumInstructions / 10) 1868 Threshold -= VectorBonus; 1869 else if (NumVectorInstructions <= NumInstructions / 2) 1870 Threshold -= VectorBonus/2; 1871 1872 return Cost < std::max(1, Threshold); 1873 } 1874 1875 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1876 /// \brief Dump stats about this call's analysis. 1877 LLVM_DUMP_METHOD void CallAnalyzer::dump() { 1878 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" 1879 DEBUG_PRINT_STAT(NumConstantArgs); 1880 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); 1881 DEBUG_PRINT_STAT(NumAllocaArgs); 1882 DEBUG_PRINT_STAT(NumConstantPtrCmps); 1883 DEBUG_PRINT_STAT(NumConstantPtrDiffs); 1884 DEBUG_PRINT_STAT(NumInstructionsSimplified); 1885 DEBUG_PRINT_STAT(NumInstructions); 1886 DEBUG_PRINT_STAT(SROACostSavings); 1887 DEBUG_PRINT_STAT(SROACostSavingsLost); 1888 DEBUG_PRINT_STAT(LoadEliminationCost); 1889 DEBUG_PRINT_STAT(ContainsNoDuplicateCall); 1890 DEBUG_PRINT_STAT(Cost); 1891 DEBUG_PRINT_STAT(Threshold); 1892 #undef DEBUG_PRINT_STAT 1893 } 1894 #endif 1895 1896 /// \brief Test that there are no attribute conflicts between Caller and Callee 1897 /// that prevent inlining. 1898 static bool functionsHaveCompatibleAttributes(Function *Caller, 1899 Function *Callee, 1900 TargetTransformInfo &TTI) { 1901 return TTI.areInlineCompatible(Caller, Callee) && 1902 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 1903 } 1904 1905 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) { 1906 int Cost = 0; 1907 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { 1908 if (CS.isByValArgument(I)) { 1909 // We approximate the number of loads and stores needed by dividing the 1910 // size of the byval type by the target's pointer size. 1911 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType()); 1912 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType()); 1913 unsigned AS = PTy->getAddressSpace(); 1914 unsigned PointerSize = DL.getPointerSizeInBits(AS); 1915 // Ceiling division. 1916 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; 1917 1918 // If it generates more than 8 stores it is likely to be expanded as an 1919 // inline memcpy so we take that as an upper bound. Otherwise we assume 1920 // one load and one store per word copied. 1921 // FIXME: The maxStoresPerMemcpy setting from the target should be used 1922 // here instead of a magic number of 8, but it's not available via 1923 // DataLayout. 1924 NumStores = std::min(NumStores, 8U); 1925 1926 Cost += 2 * NumStores * InlineConstants::InstrCost; 1927 } else { 1928 // For non-byval arguments subtract off one instruction per call 1929 // argument. 1930 Cost += InlineConstants::InstrCost; 1931 } 1932 } 1933 // The call instruction also disappears after inlining. 1934 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty; 1935 return Cost; 1936 } 1937 1938 InlineCost llvm::getInlineCost( 1939 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, 1940 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 1941 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, 1942 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 1943 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, 1944 GetAssumptionCache, GetBFI, PSI, ORE); 1945 } 1946 1947 InlineCost llvm::getInlineCost( 1948 CallSite CS, Function *Callee, const InlineParams &Params, 1949 TargetTransformInfo &CalleeTTI, 1950 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 1951 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, 1952 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { 1953 1954 // Cannot inline indirect calls. 1955 if (!Callee) 1956 return llvm::InlineCost::getNever(); 1957 1958 // Calls to functions with always-inline attributes should be inlined 1959 // whenever possible. 1960 if (CS.hasFnAttr(Attribute::AlwaysInline)) { 1961 if (isInlineViable(*Callee)) 1962 return llvm::InlineCost::getAlways(); 1963 return llvm::InlineCost::getNever(); 1964 } 1965 1966 // Never inline functions with conflicting attributes (unless callee has 1967 // always-inline attribute). 1968 Function *Caller = CS.getCaller(); 1969 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI)) 1970 return llvm::InlineCost::getNever(); 1971 1972 // Don't inline this call if the caller has the optnone attribute. 1973 if (Caller->hasFnAttribute(Attribute::OptimizeNone)) 1974 return llvm::InlineCost::getNever(); 1975 1976 // Don't inline functions which can be interposed at link-time. Don't inline 1977 // functions marked noinline or call sites marked noinline. 1978 // Note: inlining non-exact non-interposable functions is fine, since we know 1979 // we have *a* correct implementation of the source level function. 1980 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) || 1981 CS.isNoInline()) 1982 return llvm::InlineCost::getNever(); 1983 1984 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 1985 << "... (caller:" << Caller->getName() << ")\n"); 1986 1987 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS, 1988 Params); 1989 bool ShouldInline = CA.analyzeCall(CS); 1990 1991 DEBUG(CA.dump()); 1992 1993 // Check if there was a reason to force inlining or no inlining. 1994 if (!ShouldInline && CA.getCost() < CA.getThreshold()) 1995 return InlineCost::getNever(); 1996 if (ShouldInline && CA.getCost() >= CA.getThreshold()) 1997 return InlineCost::getAlways(); 1998 1999 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 2000 } 2001 2002 bool llvm::isInlineViable(Function &F) { 2003 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 2004 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { 2005 // Disallow inlining of functions which contain indirect branches or 2006 // blockaddresses. 2007 if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken()) 2008 return false; 2009 2010 for (auto &II : *BI) { 2011 CallSite CS(&II); 2012 if (!CS) 2013 continue; 2014 2015 // Disallow recursive calls. 2016 if (&F == CS.getCalledFunction()) 2017 return false; 2018 2019 // Disallow calls which expose returns-twice to a function not previously 2020 // attributed as such. 2021 if (!ReturnsTwice && CS.isCall() && 2022 cast<CallInst>(CS.getInstruction())->canReturnTwice()) 2023 return false; 2024 2025 // Disallow inlining functions that call @llvm.localescape. Doing this 2026 // correctly would require major changes to the inliner. 2027 if (CS.getCalledFunction() && 2028 CS.getCalledFunction()->getIntrinsicID() == 2029 llvm::Intrinsic::localescape) 2030 return false; 2031 } 2032 } 2033 2034 return true; 2035 } 2036 2037 // APIs to create InlineParams based on command line flags and/or other 2038 // parameters. 2039 2040 InlineParams llvm::getInlineParams(int Threshold) { 2041 InlineParams Params; 2042 2043 // This field is the threshold to use for a callee by default. This is 2044 // derived from one or more of: 2045 // * optimization or size-optimization levels, 2046 // * a value passed to createFunctionInliningPass function, or 2047 // * the -inline-threshold flag. 2048 // If the -inline-threshold flag is explicitly specified, that is used 2049 // irrespective of anything else. 2050 if (InlineThreshold.getNumOccurrences() > 0) 2051 Params.DefaultThreshold = InlineThreshold; 2052 else 2053 Params.DefaultThreshold = Threshold; 2054 2055 // Set the HintThreshold knob from the -inlinehint-threshold. 2056 Params.HintThreshold = HintThreshold; 2057 2058 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. 2059 Params.HotCallSiteThreshold = HotCallSiteThreshold; 2060 2061 // If the -locally-hot-callsite-threshold is explicitly specified, use it to 2062 // populate LocallyHotCallSiteThreshold. Later, we populate 2063 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if 2064 // we know that optimization level is O3 (in the getInlineParams variant that 2065 // takes the opt and size levels). 2066 // FIXME: Remove this check (and make the assignment unconditional) after 2067 // addressing size regression issues at O2. 2068 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) 2069 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2070 2071 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold. 2072 Params.ColdCallSiteThreshold = ColdCallSiteThreshold; 2073 2074 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the 2075 // -inlinehint-threshold commandline option is not explicitly given. If that 2076 // option is present, then its value applies even for callees with size and 2077 // minsize attributes. 2078 // If the -inline-threshold is not specified, set the ColdThreshold from the 2079 // -inlinecold-threshold even if it is not explicitly passed. If 2080 // -inline-threshold is specified, then -inlinecold-threshold needs to be 2081 // explicitly specified to set the ColdThreshold knob 2082 if (InlineThreshold.getNumOccurrences() == 0) { 2083 Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold; 2084 Params.OptSizeThreshold = InlineConstants::OptSizeThreshold; 2085 Params.ColdThreshold = ColdThreshold; 2086 } else if (ColdThreshold.getNumOccurrences() > 0) { 2087 Params.ColdThreshold = ColdThreshold; 2088 } 2089 return Params; 2090 } 2091 2092 InlineParams llvm::getInlineParams() { 2093 return getInlineParams(InlineThreshold); 2094 } 2095 2096 // Compute the default threshold for inlining based on the opt level and the 2097 // size opt level. 2098 static int computeThresholdFromOptLevels(unsigned OptLevel, 2099 unsigned SizeOptLevel) { 2100 if (OptLevel > 2) 2101 return InlineConstants::OptAggressiveThreshold; 2102 if (SizeOptLevel == 1) // -Os 2103 return InlineConstants::OptSizeThreshold; 2104 if (SizeOptLevel == 2) // -Oz 2105 return InlineConstants::OptMinSizeThreshold; 2106 return InlineThreshold; 2107 } 2108 2109 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { 2110 auto Params = 2111 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); 2112 // At O3, use the value of -locally-hot-callsite-threshold option to populate 2113 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only 2114 // when it is specified explicitly. 2115 if (OptLevel > 2) 2116 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; 2117 return Params; 2118 } 2119