//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
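// Like the other cl::opt knobs in this file, this can be overridden on the
// command line, e.g. "opt -inlinecold-threshold=<N>" or, through the clang
// driver, "-mllvm -inlinecold-threshold=<N>".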
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasUninlineableIntrinsic;
  bool UsesVarArgs;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining. The most important aspect we track
  /// is CFG altering simplifications -- when we prove a basic block dead, that
  /// can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  /// Keep track of dead blocks due to the constant arguments.
  SetVector<BasicBlock *> DeadBlocks;

  /// The mapping of the blocks to their known unique successors due to the
  /// constant arguments.
  DenseMap<BasicBlock *, BasicBlock *> KnownSuccessors;

  /// Model the elimination of repeated loads that is expected to happen
  /// whenever we simplify away the stores that would otherwise cause them to
  /// be reloaded.
  bool EnableLoadElimination;
  SmallPtrSet<Value *, 16> LoadAddrSet;
  int LoadEliminationCost;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  void disableLoadElimination();
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non-null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasUninlineableIntrinsic(false), UsesVarArgs(false), AllocatedSize(0),
        NumInstructions(0), NumVectorInstructions(0), VectorBonus(0),
        SingleBBBonus(0), EnableLoadElimination(true), LoadEliminationCost(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
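/// (Concretely: the value was recorded in SROAArgValues, which tracks values
/// that map back through function arguments to allocas in the caller.)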
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
  disableLoadElimination();
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

void CallAnalyzer::disableLoadElimination() {
  if (EnableLoadElimination) {
    Cost += LoadEliminationCost;
    LoadEliminationCost = 0;
    EnableLoadElimination = false;
  }
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getIndexTypeSizeInBits(GEP.getType());
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  // FIXME: Pointer sizes may differ between different address spaces, so do we
  // need to use the correct address space in the call to getPointerSizeInBits
  // here? Or could we skip the getPointerSizeInBits call completely? As far as
  // I can see, the ZeroOffset is used as a dummy value, so we can probably use
  // any bit width for the ZeroOffset.
  APInt ZeroOffset = APInt::getNullValue(DL.getPointerSizeInBits(0));
  bool CheckSROA = I.getType()->isPointerTy();

  // Track the constant or pointer with constant offset we've seen so far.
  Constant *FirstC = nullptr;
  std::pair<Value *, APInt> FirstBaseAndOffset = {nullptr, ZeroOffset};
  Value *FirstV = nullptr;

  for (unsigned i = 0, e = I.getNumIncomingValues(); i != e; ++i) {
    BasicBlock *Pred = I.getIncomingBlock(i);
    // If the incoming block is dead, skip the incoming block.
    if (DeadBlocks.count(Pred))
      continue;
    // If the parent block of phi is not the known successor of the incoming
    // block, skip the incoming block.
    BasicBlock *KnownSuccessor = KnownSuccessors[Pred];
    if (KnownSuccessor && KnownSuccessor != I.getParent())
      continue;

    Value *V = I.getIncomingValue(i);
    // If the incoming value is this phi itself, skip the incoming value.
    if (&I == V)
      continue;

    Constant *C = dyn_cast<Constant>(V);
    if (!C)
      C = SimplifiedValues.lookup(V);

    std::pair<Value *, APInt> BaseAndOffset = {nullptr, ZeroOffset};
    if (!C && CheckSROA)
      BaseAndOffset = ConstantOffsetPtrs.lookup(V);

    if (!C && !BaseAndOffset.first)
      // The incoming value is neither a constant nor a pointer with constant
      // offset, exit early.
      return true;

    if (FirstC) {
      if (FirstC == C)
        // If we've seen a constant incoming value before and it is the same
        // constant we see this time, continue checking the next incoming value.
        continue;
      // Otherwise early exit because we either see a different constant or saw
      // a constant before but we have a pointer with constant offset this time.
      return true;
    }

    if (FirstV) {
      // The same logic as above, but check pointer with constant offset here.
      if (FirstBaseAndOffset == BaseAndOffset)
        continue;
      return true;
    }

    if (C) {
      // This is the first time we've seen a constant, record it.
      FirstC = C;
      continue;
    }

    // The remaining case is that this is the first time we've seen a pointer
    // with constant offset, record it.
    FirstV = V;
    FirstBaseAndOffset = BaseAndOffset;
  }

  // Check if we can map phi to a constant.
  if (FirstC) {
    SimplifiedValues[&I] = FirstC;
    return true;
  }

  // Check if we can map phi to a pointer with constant offset.
  if (FirstBaseAndOffset.first) {
    ConstantOffsetPtrs[&I] = FirstBaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt))
      SROAArgValues[&I] = SROAArg;
  }

  return true;
}

/// \brief Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  unsigned AS = I.getOperand(0)->getType()->getPointerAddressSpace();
  if (IntegerSize >= DL.getPointerSizeInBits(AS)) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
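  // (If the integer is wider than the pointer, the inttoptr truncates it and
  // the tracked offset may no longer be accurate, so the tracking below only
  // fires when the integer type is no wider than the pointer type.)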
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerTypeSizeInBits(I.getType())) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  // If this is a floating-point cast, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  switch (I.getOpcode()) {
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
    if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
      Cost += InlineConstants::CallPenalty;
    break;
  default:
    break;
  }

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may be
  // beneficial:
  //   main() {
  //     hot_call_1();
  //     ...
  //     hot_call_N();
  //     exit(0);
  //   }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages. These are multiplied by Threshold to get the
  // bonus values.
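  // As a worked example: if none of the adjustments in this function change a
  // default Threshold of 225 and the threshold multiplier is 1, the bonuses
  // computed at the end of this function are SingleBBBonus = 225 * 50 / 100 =
  // 112 and VectorBonus = 225 * 150 / 100 = 337.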
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that amount
  // is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function, as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
    // a static function will, at the minimum, eliminate the parameter setup and
    // call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase,
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase,
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
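  // (A compare of an alloca-derived pointer against null is credited as an
  // SROA saving, since SROA would remove it; any other pointer compare
  // disables SROA for that argument.)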
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Constant *CLHS = dyn_cast<Constant>(LHS);
  if (!CLHS)
    CLHS = SimplifiedValues.lookup(LHS);
  Constant *CRHS = dyn_cast<Constant>(RHS);
  if (!CRHS)
    CRHS = SimplifiedValues.lookup(RHS);

  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV = SimplifyFPBinOp(I.getOpcode(), CLHS ? CLHS : LHS,
                              CRHS ? CRHS : RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV =
        SimplifyBinOp(I.getOpcode(), CLHS ? CLHS : LHS, CRHS ? CRHS : RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV))
    SimplifiedValues[&I] = C;

  if (SimpleV)
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  // If the instruction is floating point, and the target says this operation
  // is expensive, this may eventually become a library call. Treat the cost
  // as such.
  if (I.getType()->isFloatingPointTy() &&
      TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive)
    Cost += InlineConstants::CallPenalty;

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // If the data is already loaded from this address and hasn't been clobbered
  // by any stores or calls, this load is likely to be redundant and can be
  // eliminated.
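  // (The expected saving accumulates in LoadEliminationCost and is added back
  // to Cost if load elimination is later disabled by a clobbering write.)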
  if (EnableLoadElimination &&
      !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) {
    LoadEliminationCost += InlineConstants::InstrCost;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  // The store can potentially clobber loads and prevent repeated loads from
  // being eliminated.
  // FIXME:
  // 1. We can probably keep an initial set of eliminatable loads subtracted
  // from the cost even when we finally see a store. We just need to disable
  // *further* accumulation of elimination savings.
  // 2. We should probably at some point thread MemorySSA for the callee into
  // this and then use that to actually compute *really* precise savings.
  disableLoadElimination();
  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
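    // (ExposesReturnsTwice is one of the uninlinable patterns checked in
    // analyzeBlock.)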
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        if (!CS.onlyReadsMemory() && !isAssumeLikeIntrinsic(II))
          disableLoadElimination();
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        disableLoadElimination();
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::icall_branch_funnel:
      case Intrinsic::localescape:
        HasUninlineableIntrinsic = true;
        return false;
      case Intrinsic::vastart:
      case Intrinsic::vaend:
        UsesVarArgs = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getFunction()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F) {
    if (!CS.onlyReadsMemory())
      disableLoadElimination();
    return Base::visitCallSite(CS);
  }

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  if (!F->onlyReadsMemory())
    disableLoadElimination();
  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
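  // (SelectedV is the arm picked by the constant condition; forward its
  // constant value, base/offset pair, or SROA mapping to the select itself.)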
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increased in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search, we should find the number of nodes,
  // which is the same as the number of comparisons when lowered. For a given
  // number of clusters, n, we can define a recursive function, f(n), to find
  // the number of nodes in the tree. The recursion is:
  //   f(n) = 1 + f(n / 2) + f(n - n / 2), when n > 3,
  //   and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //     = 2^log2(n) * 2^-1 - 1
  //     = n / 2 - 1.
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function, which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
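///
/// For example (illustrative): an instruction that folds to a constant via
/// SimplifiedValues contributes nothing to Cost, while an unsimplified one
/// adds InlineConstants::InstrCost.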
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasUninlineableIntrinsic || UsesVarArgs) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " has uninlinable pattern and cost is not fully computed";
        });
      return false;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
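///
/// For example (illustrative): given an inbounds GEP such as
///   %p = getelementptr inbounds i32, i32* %base, i64 4
/// V is rewritten to %base and the returned constant is 16 on a target with
/// 4-byte i32 and a 64-bit index size.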
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned AS = V->getType()->getPointerAddressSpace();
  unsigned IntPtrWidth = DL.getIndexSizeInBits(AS);
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext(), AS);
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Find dead blocks due to deleted CFG edges during inlining.
///
/// If we know the successor of the current block, \p CurrBB, has to be \p
/// NextBB, the other successors of \p CurrBB are dead if these successors have
/// no live incoming CFG edges. If one block is found to be dead, we can
/// continue growing the dead block list by checking the successors of the dead
/// blocks to see if all their incoming edges are dead or not.
void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) {
  auto IsEdgeDead = [&](BasicBlock *Pred, BasicBlock *Succ) {
    // A CFG edge is dead if the predecessor is dead or the predecessor has a
    // known successor which is not the one under examination.
    return (DeadBlocks.count(Pred) ||
            (KnownSuccessors[Pred] && KnownSuccessors[Pred] != Succ));
  };

  auto IsNewlyDead = [&](BasicBlock *BB) {
    // If all the edges to a block are dead, the block is also dead.
    return (!DeadBlocks.count(BB) &&
            llvm::all_of(predecessors(BB),
                         [&](BasicBlock *P) { return IsEdgeDead(P, BB); }));
  };

  for (BasicBlock *Succ : successors(CurrBB)) {
    if (Succ == NextBB || !IsNewlyDead(Succ))
      continue;
    SmallVector<BasicBlock *, 4> NewDead;
    NewDead.push_back(Succ);
    while (!NewDead.empty()) {
      BasicBlock *Dead = NewDead.pop_back_val();
      if (DeadBlocks.insert(Dead))
        // Continue growing the dead block lists.
        for (BasicBlock *S : successors(Dead))
          if (IsNewlyDead(S))
            NewDead.push_back(S);
    }
  }
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.
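  //
  // A brief (illustrative) road map of what follows: update the threshold for
  // this callsite, speculatively apply the single-BB and vector bonuses,
  // credit back the callsite setup cost, seed the simplified-value and
  // constant-offset-pointer maps from the actual call arguments, and then walk
  // the live basic blocks with analyzeBlock() until either the worklist is
  // exhausted or Cost exceeds Threshold.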

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold at any time, and cost cannot decrease, we can stop
  // processing the rest of the function body.
  Threshold += (SingleBBBonus + VectorBonus);

  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getFunction();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getFunction() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors.
  // To accomplish this, prioritizing small iteration counts because we exit
  // once we cross our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by this
    // call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BasicBlock *NextBB = BI->getSuccessor(SimpleCond->isZero() ? 1 : 0);
          BBWorklist.insert(NextBB);
          KnownSuccessors[BB] = NextBB;
          findDeadBlocks(BB, NextBB);
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BasicBlock *NextBB = SI->findCaseValue(SimpleCond)->getCaseSuccessor();
        BBWorklist.insert(NextBB);
        KnownSuccessors[BB] = NextBB;
        findDeadBlocks(BB, NextBB);
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(LoadEliminationCost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned AS = PTy->getAddressSpace();
      unsigned PointerSize = DL.getPointerSizeInBits(AS);
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
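  // For example (illustrative): a call passing two plain (non-byval) arguments
  // gets 2 * InstrCost from the loop above plus the InstrCost + CallPenalty
  // added below; analyzeCall() then credits this whole amount back against the
  // inline cost.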
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Never inline calls with byval arguments that do not have the alloca
  // address space. Since byval arguments can be replaced with a copy to an
  // alloca, the inlined code would need to be adjusted to handle that the
  // argument is in the alloca address space (so it is a little bit complicated
  // to solve).
  unsigned AllocaAS = Callee->getParent()->getDataLayout().getAllocaAddrSpace();
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I)
    if (CS.isByValArgument(I)) {
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      if (PTy->getAddressSpace() != AllocaAS)
        return llvm::InlineCost::getNever();
    }

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
      CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
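  // For example (illustrative): analyzeCall() returns false for a callee
  // containing an indirectbr even if the accumulated cost stayed below the
  // threshold; that combination is reported as a hard "never inline" here.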
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      if (CS.getCalledFunction())
        switch (CS.getCalledFunction()->getIntrinsicID()) {
        default:
          break;
        // Disallow inlining of @llvm.icall.branch.funnel because the current
        // backend can't separate call targets from call arguments.
        case llvm::Intrinsic::icall_branch_funnel:
        // Disallow inlining functions that call @llvm.localescape. Doing this
        // correctly would require major changes to the inliner.
        case llvm::Intrinsic::localescape:
        // Disallow inlining of functions that access VarArgs.
        case llvm::Intrinsic::vastart:
        case llvm::Intrinsic::vaend:
          return false;
        }
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
  // we know that the optimization level is O3 (in the getInlineParams variant
  // that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag has
  // effect only when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
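
// Example (illustrative sketch): a typical client builds the parameters from
// the optimization levels and then queries the cost for a concrete call site.
// The getters (GetAssumptionCache, GetBFI), PSI, and ORE below are assumed to
// be provided by the calling pass.
//
//   InlineParams Params = getInlineParams(OptLevel, SizeOptLevel);
//   InlineCost IC = getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache,
//                                 GetBFI, PSI, &ORE);
//   if (IC)
//     ; // profitable (or forced) to inline; the caller performs the transform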