//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));
// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites "));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites "));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasUninlineableIntrinsic;
  bool UsesVarArgs;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;
  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise prevent that
  /// elimination.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasUninlineableIntrinsic(false), UsesVarArgs(false), AllocatedSize(0),
        NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
        SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

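/// Handle phi nodes by propagating constants and pointer base/offset pairs
/// from their live incoming values, so a phi whose reachable inputs all agree
/// can be simplified just like the inputs themselves.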
bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use correct address space in the call to getPointerSizeInBits here?
  // Or could we skip the getPointerSizeInBits call completely? As far as I can
  // see the ZeroOffset is used as a dummy value, so we can probably use any
  // bit width for the ZeroOffset?
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the 1st time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the 1st time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}
/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

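/// Handle inttoptr in the same spirit as ptrtoint: propagate constants and,
/// when the source integer is no wider than a pointer, keep tracking the
/// base/offset pair and the SROA candidate through the cast.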
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry.
  // For example, in the scenario below, inlining hot_call_X() may be
  // beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that
  // amount is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter setup
    // and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite, including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase,
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee, including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase,
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

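/// Handle compares by folding comparisons of known constants, folding
/// comparisons of constant-offset pointers that share a common base, and
/// exploiting known-non-null arguments for equality comparisons against null.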
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

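/// Model load instructions: simple loads from SROA-able allocas are credited
/// to the SROA savings, and a repeated load from an address we have already
/// seen is treated as eliminable while load elimination remains enabled.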
bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

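/// Model a nested call site. Calls that simplify to constants are free;
/// known intrinsics are costed individually; other calls pay a per-argument
/// setup cost and, when actually lowered to a call, an additional call
/// penalty. Indirect calls with an inferable target are analyzed recursively
/// to compute a bonus.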
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
      case Intrinsic::vaend:
        UsesVarArgs = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

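/// Model select instructions: a select whose condition folds to a constant
/// reduces to the selected operand, and selects of identical constants or of
/// pointers with the same base/offset pair can likewise be simplified.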
bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search tree, we should find the number of
  // nodes, which is the same as the number of comparisons when lowered. For a
  // given number of clusters, n, we can define a recursive function, f(n), to
  // find the number of nodes in the tree. The recursion is:
  // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
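  // For example, with n = 8 case clusters this estimates 8 * 3 / 2 - 1 = 11
  // comparisons, i.e. 22 instructions at one compare plus one branch each.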
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function, which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
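///
/// Note that a "viable" result here only means nothing analyzed so far rules
/// inlining out; the final cost-versus-threshold decision is made by
/// analyzeCall once all live blocks have been visited.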
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special-case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasUninlineableIntrinsic || UsesVarArgs) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " has uninlinable pattern and cost is not fully computed";
        });
      return false;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}

/// Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
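///
/// For example, for 'getelementptr inbounds i32, i32* %base, i64 4', V is
/// rewritten to %base and the returned constant is 16 (four 4-byte elements).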
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block list.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

/// Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
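///
/// Note that the analysis mutates both Cost and Threshold as it runs, so
/// callers should consult getCost() and getThreshold() afterwards rather than
/// relying on the boolean result alone.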
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);

  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
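  // For example, a constant i1 argument lets us fold branches on that
  // parameter below, and a pointer argument rooted at a caller alloca lets us
  // track SROA savings for accesses made through it.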
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size: this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

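    // For example, if the call passes i1 true for a parameter and a block
    // ends in 'br i1 %flag, label %then, label %else' on that parameter, only
    // %then is enqueued above and %else becomes a candidate dead block.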
    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the callee (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy, so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
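      // For example, a 256-byte byval struct copied with 64-bit pointers
      // needs 32 word-sized copies, so it is capped at 8 below and charged
      // as 2 * 8 * InstrCost.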
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
    if (CS.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever();
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
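  // For example, a callee containing an indirectbr makes analyzeCall return
  // false even when the accumulated cost is still below the threshold, and we
  // turn that into an explicit "never inline" result here.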
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      if (CS.getCalledFunction())
        switch (CS.getCalledFunction()->getIntrinsicID()) {
        default:
          break;
        // Disallow inlining of @llvm.icall.branch.funnel because the current
        // backend can't separate call targets from call arguments.
        case llvm::Intrinsic::icall_branch_funnel:
        // Disallow inlining functions that call @llvm.localescape. Doing this
        // correctly would require major changes to the inliner.
        case llvm::Intrinsic::localescape:
        // Disallow inlining of functions that access VarArgs.
        case llvm::Intrinsic::vastart:
        case llvm::Intrinsic::vaend:
          return false;
        }
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;
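  // For example, passing -inline-threshold=500 on the command line overrides
  // whatever Threshold value was passed in, whereas without the flag the
  // passed-in value (e.g. 225 from the -O2 default below) is used as-is.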
  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that the optimization level is O3 (in the getInlineParams variant
  // that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold command-line option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
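// For example, getInlineParams(3, 0) starts from OptAggressiveThreshold and
// also seeds LocallyHotCallSiteThreshold, while getInlineParams(2, 1) simply
// uses OptSizeThreshold as the default threshold.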