//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation-based
// PGO before we actually hook up the inliner with analysis passes such as
// BPI and BFI.
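// For example, passing -inlinecold-threshold=45 caps the threshold used for
// callees carrying the cold attribute at 45; see the MinIfValid use in
// updateThreshold() below.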
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;
  bool UsesVarArgs;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining.
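  /// For example, a conditional branch whose condition is an argument that
  /// this callsite passes as a constant is treated as folding to a single
  /// successor.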
  /// The most important aspect we track is CFG-altering simplifications --
  /// when we prove a basic block dead, that can cause dramatic shifts in the
  /// cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise clobber them.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), UsesVarArgs(false), AllocatedSize(0),
        NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
        SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
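/// These are exactly the keys of SROAArgValues: values reached (through this
/// call's arguments) from allocas in the caller.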
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Look up the SROA-candidate argument and cost iterator which V maps
/// to. Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getPointerTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
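/// Simplified index operands are substituted before querying TTI, so a GEP
/// whose indices have all folded to constants is typically folded into
/// addressing and reported free.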
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see, the ZeroOffset is used as a dummy value, so we can probably use
  // any bit width for the ZeroOffset.
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
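    // (A phi may use itself through a loop backedge; such an operand adds no
    // new information about the tracked value.)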
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming
        // value.
        continue;
      // Otherwise exit early because we either see a different constant or saw
      // a constant before but have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the first time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the first time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// \brief Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
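/// Operands that are not literal constants are looked up in SimplifiedValues,
/// so values folded earlier in the walk participate here as well.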
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca-derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry.
  // For example, in the scenario below, inlining the hot calls may be
  // beneficial:
  //   main() {
  //     hot_call_1();
  //     ...
  //     hot_call_N();
  //     exit(0);
  //   }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate on this based on BPI and BFI in
  // more general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to the caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {
  // If a global profile summary is available, then the callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to the caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
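  // (For example, with the default threshold of 225, the 50% single-BB bonus
  // below comes to 112 and the 150% vector bonus to 337.)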
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50%, and half that
  // amount is applied if they exceed 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function, as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter setup
    // and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile-based
  // hotness information if the caller does not have the MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite, including the
      // LastCallToStatic bonus.
      // While this bonus might result in code size reduction, it can cause
      // the size of a non-cold caller to increase, preventing it from being
      // inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use the callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee, including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase,
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this
  // method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
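  // A compare of an alloca-derived pointer against null folds away entirely
  // once SROA fires, so credit its cost to the candidate; any other compare
  // keeps the pointer live and must disable SROA.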
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
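  // For example, the second of two unordered loads of the same address with no
  // intervening store is credited to LoadEliminationCost; if a clobber shows
  // up later, disableLoadElimination() charges the savings back.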
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these, but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these, but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
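    // (E.g. a call to a returns_twice function such as setjmp: inlining it
    // would expose a returns-twice call in a caller that is not prepared
    // for it.)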
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't
        // free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      case Intrinsic::vastart:
      case Intrinsic::vaend:
        UsesVarArgs = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for an average of one instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline asm will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for an average of
  // one instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
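    // (For example, a nested analysis ending with cost 100 against threshold T
    // yields a bonus of max(0, T - 100); if analyzeCall fails, no bonus is
    // applied at all.)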
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
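  // (For a scalar i1 condition these are simply true and false respectively.)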
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search tree, we should find the number of
  // nodes, which is the same as the number of comparisons when lowered. For a
  // given number of clusters, n, we can define a recursive function, f(n), to
  // find the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   f(n) = n, when n <= 3.
  // This leads to a binary tree where each leaf is either f(2) or f(3) when
  // n > 3. So, the number of comparisons from leaves should be n, while the
  // number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //     = 2^log2(n) * 2^-1 - 1
  //     = n / 2 - 1.
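  // (Sanity check: n = 8 gives f(8) = 1 + 2 * f(4) = 1 + 2 * (1 + 2 * f(2))
  // = 11, i.e. 8 leaf comparisons plus 8/2 - 1 = 3 non-leaf nodes.)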

  // When the switch is lowered to a binary search tree, the number of nodes
  // in the tree equals the number of comparisons. For a given number of
  // clusters, n, we can define a recursive function, f(n), to find the number
  // of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n/2) + f(n - n/2), when n > 3,
  //   f(n) = n, when n <= 3.
  // This leads to a binary tree where each leaf covers either 2 or 3 clusters
  // when n > 3. So the number of comparisons from leaves is n, while the
  // number of non-leaf nodes is:
  //   2^(log2(n) - 1) - 1
  //   = 2^log2(n) * 2^-1 - 1
  //   = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }
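
  // Worked example of the closed form above (illustrative): for
  // NumCaseCluster == 9, ExpectedNumberOfCompare = 3 * 9 / 2 - 1 = 12, so the
  // switch is charged 12 * 2 * InlineConstants::InstrCost, i.e. 12 compares
  // plus 12 conditional branches.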
  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions
  // leading to unreachable, as they have the lowest possible impact on both
  // runtime and code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them to constants or dead code during
    // inlining, is used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape || UsesVarArgs) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " has uninlinable pattern and cost is not fully computed";
        });
      return false;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}
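
// Worked example of the accounting above (illustrative): a block containing
// five non-debug, non-ephemeral instructions of which three are simplified
// away by the visitor contributes 2 * InlineConstants::InstrCost to Cost and
// bumps NumInstructionsSimplified by three.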

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns null if V is not a pointer, and returns the constant '0' if there
/// are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getPointerSizeInBits(AS);
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}
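
// For illustration (a sketch with made-up IR names): given
//   %p = getelementptr inbounds [8 x i32], [8 x i32]* %base, i64 0, i64 3
// this routine rewrites V from %p to %base and returns the constant 12
// (three i32 elements at 4 bytes each), assuming a DataLayout where i32
// occupies 4 bytes.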

/// \brief Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}
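
// For illustration (a hypothetical CFG, not from any test): suppose CurrBB
// ends in a conditional branch to %then and %else, and the condition folds to
// true for this call site so NextBB is %then. If %else has no other
// predecessors it is newly dead; if a block %tail is reachable only through
// %else, it is transitively added to DeadBlocks as well.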

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold at any time, and cost cannot decrease, we can stop
  // processing the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);

  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, we use a small-size optimized SetVector, optimizing for
  // the common case of few iterations since we exit once we cross our
  // threshold.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }
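
  // For illustration (a hypothetical callee, not from any test): if the entry
  // block ends in 'br i1 %flag, label %fast, label %slow' and %flag was
  // simplified to true from a constant call argument, only %fast is enqueued
  // above; %slow and any blocks reachable solely through it are recorded as
  // dead and never charged to Cost.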

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that there are no attribute conflicts between Caller and
/// Callee that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
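  // Worked example for the byval case above (illustrative, assuming 64-bit
  // pointers): a byval struct of 256 bytes gives TypeSize = 2048 and
  // PointerSize = 64, so NumStores = 32, which is clamped to 8; the argument
  // is then charged 2 * 8 * InlineConstants::InstrCost for the copy.
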
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
    if (CS.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever();
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                     << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}
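
// A minimal usage sketch for the entry point above (illustrative; ACT,
// GetBFI, PSI, ORE, and InlineInfo stand in for state a real inliner pass
// would own and are hypothetical here):
//
//   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
//       [&](Function &F) -> AssumptionCache & {
//         return ACT->getAssumptionCache(F);
//       };
//   InlineCost IC = llvm::getInlineCost(CS, Params, CalleeTTI,
//                                       GetAssumptionCache, GetBFI, PSI, ORE);
//   if (IC)
//     InlineFunction(CS, InlineInfo); // Profitable: actually inline.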

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}
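
// Worked example of the parameter plumbing below (illustrative): a build at
// -O2 with no inline-related flags leaves Params.DefaultThreshold at the
// -inline-threshold default and picks up the OptSize/OptMinSize/Cold
// thresholds; passing an explicit -inline-threshold=500 instead sets
// DefaultThreshold to 500 and suppresses those size-based defaults (see
// getInlineParams(int) below).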

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that optimization level is O3 (in the getInlineParams variant that
  // takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold flag is not explicitly given. If that option is
  // present, then its value applies even for callees with size and minsize
  // attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of -locally-hot-callsite-threshold option to populate
  // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
  // when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}