//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

static cl::opt<int> InlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

static cl::opt<int>
    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
                          cl::init(45),
                          cl::desc("Threshold for inlining cold callsites"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(45),
    cl::desc("Threshold for inlining functions with cold attribute"));

static cl::opt<int>
    HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                         cl::ZeroOrMore,
                         cl::desc("Threshold for hot callsites"));

static cl::opt<int> LocallyHotCallSiteThreshold(
    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
    cl::desc("Threshold for locally hot callsites"));

static cl::opt<int> ColdCallSiteRelFreq(
    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
             "entry frequency, for a callsite to be cold in the absence of "
             "profile information."));

static cl::opt<int> HotCallSiteRelFreq(
    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
             "entry frequency, for a callsite to be hot in the absence of "
             "profile information."));

static cl::opt<bool> OptComputeFullInlineCost(
    "inline-cost-full", cl::Hidden, cl::init(false),
    cl::desc("Compute the full inline cost of a call site even when the cost "
             "exceeds the threshold."));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// Getter for the cache of @llvm.assume intrinsics.
  std::function<AssumptionCache &(Function &)> &GetAssumptionCache;

  /// Getter for BlockFrequencyInfo.
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;

  /// Profile summary information.
  ProfileSummaryInfo *PSI;

  /// The called function.
  Function &F;

  // Cache the DataLayout since we use it a lot.
  const DataLayout &DL;

  /// The OptimizationRemarkEmitter available for this compilation.
  OptimizationRemarkEmitter *ORE;

  /// The candidate callsite being analyzed. Please do not use this to do
  /// analysis in the caller function; we want the inline cost query to be
  /// easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  /// Tunable parameters that control the analysis.
  const InlineParams &Params;

  int Threshold;
  int Cost;
  bool ComputeFullInlineCost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int VectorBonus, TenPercentVectorBonus;
  // Bonus to be applied when the callee has only one reachable basic block.
  int SingleBBBonus;

  /// While we walk the potentially-inlined instructions, we build up and
  /// maintain a mapping of simplified values specific to this callsite. The
  /// idea is to propagate any special information we have about arguments to
  /// this call through the inlinable section of the function, and account for
  /// likely simplifications post-inlining.
  /// The most important aspect we track is CFG altering simplifications --
  /// when we prove a basic block dead, that can cause dramatic shifts in the
  /// cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  /// Keep track of the values which map back (through function arguments) to
  /// allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  /// The mapping of caller Alloca values to their accumulated cost savings. If
  /// we have to disable SROA for one of the allocas, this tells us how much
  /// cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  /// Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPFree(GetElementPtrInst &GEP);
  bool canFoldInboundsGEP(GetElementPtrInst &I);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  template <typename Callable>
  bool simplifyInstruction(Instruction &I, Callable Evaluate);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  /// Return true if \p CS is a cold callsite.
  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);

  /// Return a higher threshold if \p CS is a hot callsite.
  Optional<int> getHotCallSiteThreshold(CallSite CS,
                                        BlockFrequencyInfo *CallerBFI);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitAnd(BinaryOperator &I);
  bool visitOr(BinaryOperator &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSelectInst(SelectInst &SI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI,
               std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
               Function &Callee, CallSite CSArg, const InlineParams &Params)
      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
        CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
                                       Params.ComputeFullInlineCost || ORE),
        IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
        NumInstructionsSimplified(0), SROACostSavings(0),
        SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero())
      continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = GTI.getStructTypeOrNull()) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

/// \brief Use TTI to check whether a GEP is free.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
  SmallVector<Value *, 4> Operands;
  Operands.push_back(GEP.getOperand(0));
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
      Operands.push_back(SimpleOp);
    else
      Operands.push_back(*I);
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca and handle that case.
  if (I.isArrayAllocation()) {
    Constant *Size = SimplifiedValues.lookup(I.getArraySize());
    if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
      Type *Ty = I.getAllocatedType();
      AllocatedSize = SaturatingMultiplyAdd(
          AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
  }

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

/// \brief Check whether we can fold GEPs of constant-offset call site argument
/// pointers. This requires target data and inbounds GEPs.
///
/// \return true if the specified GEP can be folded.
bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
  // Check if we have a base + offset for the pointer.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getPointerOperand());
  if (!BaseAndOffset.first)
    return false;

  // Check if the offset of this GEP is constant, and if so accumulate it
  // into Offset.
  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
    return false;

  // Add the result as a new mapping to Base + Offset.
  ConstantOffsetPtrs[&I] = BaseAndOffset;

  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate =
      lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);

  // Lambda to check whether a GEP's indices are all constant.
  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
        return false;
    return true;
  };

  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return isGEPFree(I);
}

/// Simplify \p I if its operands are constants and update SimplifiedValues.
/// \p Evaluate is a callable specific to instruction type that evaluates the
/// instruction when all the operands are constants.
template <typename Callable>
bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
  SmallVector<Constant *, 2> COps;
  for (Value *Op : I.operands()) {
    Constant *COp = dyn_cast<Constant>(Op);
    if (!COp)
      COp = SimplifiedValues.lookup(Op);
    if (!COp)
      return false;
    COps.push_back(COp);
  }
  auto *C = Evaluate(COps);
  if (!C)
    return false;
  SimplifiedValues[&I] = C;
  return true;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getBitCast(COps[0], I.getType());
      }))
    return true;

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getPtrToInt(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset =
        ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in and of themselves
  // disable SROA (ext) without some later use that we would see and disable.
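  // A minimal illustration (hypothetical IR, not from this file):
  //   %p = ptrtoint i32* %sroa.candidate to i64
  // If %p is unused in the blocks that survive inlining, the ptrtoint is
  // trivially dead and SROA of the underlying alloca is unaffected, so we keep
  // the value in SROAArgValues below.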
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getIntToPtr(COps[0], I.getType());
      }))
    return true;

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
      }))
    return true;

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantFoldInstOperands(&I, COps[0], DL);
      }))
    return true;

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}

bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
  // If global profile summary is available, then callsite's coldness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary())
    return PSI->isColdCallSite(CS, CallerBFI);

  // Otherwise we need BFI to be available.
  if (!CallerBFI)
    return false;

  // Determine if the callsite is cold relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
  auto CallerEntryFreq =
      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
  return CallSiteFreq < CallerEntryFreq * ColdProb;
}

Optional<int>
CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
                                      BlockFrequencyInfo *CallerBFI) {

  // If global profile summary is available, then callsite's hotness is
  // determined based on that.
  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
    return Params.HotCallSiteThreshold;

  // Otherwise we need BFI to be available and to have a locally hot callsite
  // threshold.
  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
    return None;

  // Determine if the callsite is hot relative to caller's entry. We could
  // potentially cache the computation of scaled entry frequency, but the added
  // complexity is not worth it unless this scaling shows up high in the
  // profiles.
  auto CallSiteBB = CS.getInstruction()->getParent();
  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
  auto CallerEntryFreq = CallerBFI->getEntryFreq();
  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
    return Params.LocallyHotCallSiteThreshold;

  // Otherwise treat it normally.
  return None;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  Function *Caller = CS.getCaller();

  // Return min(A, B) if B is valid.
  auto MinIfValid = [](int A, Optional<int> B) {
    return B ? std::min(A, B.getValue()) : A;
  };

  // Return max(A, B) if B is valid.
  auto MaxIfValid = [](int A, Optional<int> B) {
    return B ? std::max(A, B.getValue()) : A;
  };

  // Various bonus percentages.
  // These are multiplied by Threshold to get the bonus values.
  // SingleBBBonus: This bonus is applied if the callee has a single reachable
  // basic block at the given callsite context. This is speculatively applied
  // and withdrawn if more than one basic block is seen.
  //
  // Vector bonuses: We want to more aggressively inline vector-dense kernels
  // and apply this bonus based on the percentage of vector instructions. A
  // bonus is applied if the vector instructions exceed 50% and half that
  // amount is applied if it exceeds 10%. Note that these bonuses are somewhat
  // arbitrary and evolved over time by accident as much as because they are
  // principled bonuses.
  // FIXME: It would be nice to base the bonus values on something more
  // scientific.
  //
  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
  // of the last call to a static function as inlining such functions is
  // guaranteed to reduce code size.
  //
  // These bonus percentages may be set to 0 based on properties of the caller
  // and the callsite.
  int SingleBBBonusPercent = 50;
  int VectorBonusPercent = 150;
  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;

  // Lambda to set all the above bonus and bonus percentages to 0.
  auto DisallowAllBonuses = [&]() {
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
    LastCallToStaticBonus = 0;
  };

  // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
  // and reduce the threshold if the caller has the necessary attribute.
  if (Caller->optForMinSize()) {
    Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
    // For minsize, we want to disable the single BB bonus and the vector
    // bonuses, but not the last-call-to-static bonus. Inlining the last call
    // to a static function will, at the minimum, eliminate the parameter setup
    // and call/return instructions.
    SingleBBBonusPercent = 0;
    VectorBonusPercent = 0;
  } else if (Caller->optForSize())
    Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);

  // Adjust the threshold based on inlinehint attribute and profile based
  // hotness information if the caller does not have MinSize attribute.
  if (!Caller->optForMinSize()) {
    if (Callee.hasFnAttribute(Attribute::InlineHint))
      Threshold = MaxIfValid(Threshold, Params.HintThreshold);

    // FIXME: After switching to the new passmanager, simplify the logic below
    // by checking only the callsite hotness/coldness as we will reliably
    // have local profile information.
    //
    // Callsite hotness and coldness can be determined if sample profile is
    // used (which adds hotness metadata to calls) or if caller's
    // BlockFrequencyInfo is available.
    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
    if (!Caller->optForSize() && HotCallSiteThreshold) {
      DEBUG(dbgs() << "Hot callsite.\n");
      // FIXME: This should update the threshold only if it exceeds the
      // current threshold, but AutoFDO + ThinLTO currently relies on this
      // behavior to prevent inlining of hot callsites during ThinLTO
      // compile phase.
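      // (Illustrative: under the default flags assumed here, this value is
      // typically the 3000 from -hot-callsite-threshold above, a far larger
      // budget than the default inline threshold of 225.)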
      Threshold = HotCallSiteThreshold.getValue();
    } else if (isColdCallSite(CS, CallerBFI)) {
      DEBUG(dbgs() << "Cold callsite.\n");
      // Do not apply bonuses for a cold callsite including the
      // LastCallToStatic bonus. While this bonus might result in code size
      // reduction, it can cause the size of a non-cold caller to increase
      // preventing it from being inlined.
      DisallowAllBonuses();
      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
    } else if (PSI) {
      // Use callee's global profile information only if we have no way of
      // determining this via callsite information.
      if (PSI->isFunctionEntryHot(&Callee)) {
        DEBUG(dbgs() << "Hot callee.\n");
        // If callsite hotness cannot be determined, we may still know
        // that the callee is hot and treat it as a weaker hint for threshold
        // increase.
        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
      } else if (PSI->isFunctionEntryCold(&Callee)) {
        DEBUG(dbgs() << "Cold callee.\n");
        // Do not apply bonuses for a cold callee including the
        // LastCallToStatic bonus. While this bonus might result in code size
        // reduction, it can cause the size of a non-cold caller to increase
        // preventing it from being inlined.
        DisallowAllBonuses();
        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
      }
    }
  }

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();

  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
  VectorBonus = Threshold * VectorBonusPercent / 100;

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically. It may seem odd to update
  // Cost in updateThreshold, but the bonus depends on the logic in this
  // method.
  if (OnlyOneCallAndLocalLinkage)
    Cost -= LastCallToStaticBonus;
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
      }))
    return true;

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
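      // (Illustrative, hypothetical IR: with a shared base %b, comparing
      // gep inbounds i8, i8* %b, i64 4 against gep inbounds i8, i8* %b, i64 8
      // reduces to an icmp on the constant offsets 4 and 8, which folds away.)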
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitOr(BinaryOperator &I) {
  // This is necessary because the generic simplifyInstruction helper only
  // works if both operands are constants.
  for (unsigned i = 0; i < 2; ++i) {
    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
            SimplifiedValues.lookup(I.getOperand(i))))
      if (C->isAllOnesValue()) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }
  return Base::visitOr(I);
}

bool CallAnalyzer::visitAnd(BinaryOperator &I) {
  // This is necessary because the generic simplifyInstruction helper only
  // works if both operands are constants.
  for (unsigned i = 0; i < 2; ++i) {
    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
            SimplifiedValues.lookup(I.getOperand(i))))
      if (C->isZero()) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }
  return Base::visitAnd(I);
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
    Value *SimpleV = nullptr;
    if (auto FI = dyn_cast<FPMathOperator>(&I))
      SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
                                FI->getFastMathFlags(), DL);
    else
      SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
    return dyn_cast_or_null<Constant>(SimpleV);
  };

  if (simplifyInstruction(I, Evaluate))
    return true;

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
                                            /*InsertedValueOperand*/ COps[1],
                                            I.getIndices());
      }))
    return true;

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(CS, F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't
        // free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  auto IndirectCallParams = Params;
  IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                  IndirectCallParams);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call!
    // Subtract the cost from the threshold to get the bonus we want to apply,
    // but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
  bool CheckSROA = SI.getType()->isPointerTy();
  Value *TrueVal = SI.getTrueValue();
  Value *FalseVal = SI.getFalseValue();

  Constant *TrueC = dyn_cast<Constant>(TrueVal);
  if (!TrueC)
    TrueC = SimplifiedValues.lookup(TrueVal);
  Constant *FalseC = dyn_cast<Constant>(FalseVal);
  if (!FalseC)
    FalseC = SimplifiedValues.lookup(FalseVal);
  Constant *CondC =
      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));

  if (!CondC) {
    // Select C, X, X => X
    if (TrueC == FalseC && TrueC) {
      SimplifiedValues[&SI] = TrueC;
      return true;
    }

    if (!CheckSROA)
      return Base::visitSelectInst(SI);

    std::pair<Value *, APInt> TrueBaseAndOffset =
        ConstantOffsetPtrs.lookup(TrueVal);
    std::pair<Value *, APInt> FalseBaseAndOffset =
        ConstantOffsetPtrs.lookup(FalseVal);
    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;

      Value *SROAArg;
      DenseMap<Value *, int>::iterator CostIt;
      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
        SROAArgValues[&SI] = SROAArg;
      return true;
    }

    return Base::visitSelectInst(SI);
  }

  // Select condition is a constant.
  Value *SelectedV = CondC->isAllOnesValue()
                         ? TrueVal
                         : (CondC->isNullValue()) ? FalseVal : nullptr;
  if (!SelectedV) {
    // Condition is a vector constant that is not all 1s or all 0s. If all
    // operands are constants, ConstantExpr::getSelect() can handle the cases
    // such as select vectors.
    if (TrueC && FalseC) {
      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
        SimplifiedValues[&SI] = C;
        return true;
      }
    }
    return Base::visitSelectInst(SI);
  }

  // Condition is either all 1s or all 0s. SI can be simplified.
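  // (Illustrative, hypothetical IR: select i1 true, i32 %a, i32 %b picks %a
  // directly, so no select remains after inlining.)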
  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
    SimplifiedValues[&SI] = SelectedC;
    return true;
  }

  if (!CheckSROA)
    return true;

  std::pair<Value *, APInt> BaseAndOffset =
      ConstantOffsetPtrs.lookup(SelectedV);
  if (BaseAndOffset.first) {
    ConstantOffsetPtrs[&SI] = BaseAndOffset;

    Value *SROAArg;
    DenseMap<Value *, int>::iterator CostIt;
    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
      SROAArgValues[&SI] = SROAArg;
  }

  return true;
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Assume the most general case where the switch is lowered into
  // either a jump table, bit test, or a balanced binary tree consisting of
  // case clusters without merging adjacent clusters with the same
  // destination. We do not consider the switches that are lowered with a mix
  // of jump table/bit test/binary search tree. The cost of the switch is
  // proportional to the size of the tree or the size of jump table range.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.

  // Maximum valid cost increase in this function.
  int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;

  // Exit early for a large switch, assuming one case needs at least one
  // instruction.
  // FIXME: This is not true for a bit test, but ignore such case for now to
  // save compile-time.
  int64_t CostLowerBound =
      std::min((int64_t)CostUpperBound,
               (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);

  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
    Cost = CostLowerBound;
    return false;
  }

  unsigned JumpTableSize = 0;
  unsigned NumCaseCluster =
      TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);

  // If suitable for a jump table, consider the cost for the table size and
  // branch to destination.
  if (JumpTableSize) {
    int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
                     4 * InlineConstants::InstrCost;

    Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
    return false;
  }

  // Considering forming a binary search tree, we should find the number of
  // nodes, which is the same as the number of comparisons when lowered. For a
  // given number of clusters, n, we can define a recursive function, f(n), to
  // find the number of nodes in the tree. The recursion is:
  // f(n) = 1 + f(n / 2) + f(n - n / 2), when n > 3,
  // and f(n) = n, when n <= 3.
  // This will lead to a binary tree where the leaf should be either f(2) or
  // f(3) when n > 3. So, the number of comparisons from leaves should be n,
  // while the number of non-leaf nodes should be:
  //   2^(log2(n) - 1) - 1
  //     = 2^log2(n) * 2^-1 - 1
  //     = n / 2 - 1.
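  // (A worked instance, not from the original comment: for n = 8 clusters the
  // tree performs 8 leaf comparisons plus 8 / 2 - 1 = 3 non-leaf comparisons,
  // 11 in total.)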
  // Considering comparisons from leaf and non-leaf nodes, we can estimate the
  // number of comparisons in a simple closed form:
  //   n + n / 2 - 1 = n * 3 / 2 - 1
  if (NumCaseCluster <= 3) {
    // Suppose a comparison includes one compare and one conditional branch.
    Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
    return false;
  }

  int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
  int64_t SwitchCost =
      ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;

  Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this
    // may eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    using namespace ore;
    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " has uninlinable pattern and cost is not fully computed";
        });
      return false;
    }

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
      if (ORE)
        ORE->emit([&]() {
          return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
                                          CandidateCS.getInstruction())
                 << NV("Callee", &F)
                 << " is recursive and allocates too much stack space. Cost is "
                    "not fully computed";
        });
      return false;
    }

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer,
/// and accumulates the total constant offset applied in the returned
/// constant. It returns null if V is not a pointer, and returns the constant
/// '0' if there are no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties.
  updateThreshold(CS, F);

  // Speculatively apply all possible bonuses to Threshold. Since Cost never
  // decreases, once it exceeds this Threshold we can stop processing the rest
  // of the function body.
  Threshold += (SingleBBBonus + VectorBonus);

  // Give out bonuses for the callsite, as the instructions setting them up
  // will be gone after inlining.
  Cost -= getCallsiteCost(CS, DL);

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
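  // At this point the running totals are, schematically (a sketch of the
  // bookkeeping above, not separate logic):
  //   Threshold = updated threshold + SingleBBBonus + VectorBonus
  //   Cost      = -getCallsiteCost(CS, DL) [+ ColdccPenalty for coldcc]
  // so the comparison below can already be decisive before a single callee
  // instruction has been analyzed.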
  if (Cost >= Threshold && !ComputeFullInlineCost)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &GetAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We
  // avoid adding basic blocks of the callee which can be proven to be dead
  // for this particular call site in order to get more accurate cost
  // estimates. This requires a somewhat heavyweight iteration pattern: we
  // need to walk the basic blocks in a breadth-first order as we insert live
  // successors. To accomplish this, optimizing for small iteration counts
  // because we exit once we cross our threshold, we use a small-size
  // optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  bool SingleBB = true;
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll
    // under-count the cost, but only when undercounting doesn't matter.
    if (Cost >= Threshold && !ComputeFullInlineCost)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular
    // call site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
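    // A sketch of source that gives rise to such an escaping blockaddress
    // (GNU C computed goto; illustrative only, not tied to this analysis):
    //   static void *Tbl[] = { &&L0, &&L1 }; // blockaddresses escape via Tbl
    //   goto *Tbl[I];                        // lowered to an indirectbr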
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the callee (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= VectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= VectorBonus / 2;

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that there are no attribute conflicts between Caller and
/// Callee that prevent inlining.
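///
/// For example (illustrative; the exact rule is target-specific), targets
/// typically treat the callee as inline-compatible only when its
/// "target-features" are a subset of the caller's, so a callee requiring
/// "+avx2" is not inlined into a caller compiled without that feature.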
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
  int Cost = 0;
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost += 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost += InlineConstants::InstrCost;
    }
  }
  // The call instruction also disappears after inlining.
  Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
  return Cost;
}

InlineCost llvm::getInlineCost(
    CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
  return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
                       GetAssumptionCache, GetBFI, PSI, ORE);
}

InlineCost llvm::getInlineCost(
    CallSite CS, Function *Callee, const InlineParams &Params,
    TargetTransformInfo &CalleeTTI,
    std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
    Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  Function *Caller = CS.getCaller();
  if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (Caller->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't
  // inline functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we
  // know we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "... (caller:" << Caller->getName() << ")\n");

  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                  Params);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or to forbid it.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not
      // previously attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}

// APIs to create InlineParams based on command line flags and/or other
// parameters.

InlineParams llvm::getInlineParams(int Threshold) {
  InlineParams Params;

  // This field is the threshold to use for a callee by default. This is
  // derived from one or more of:
  //  * optimization or size-optimization levels,
  //  * a value passed to the createFunctionInliningPass function, or
  //  * the -inline-threshold flag.
  // If the -inline-threshold flag is explicitly specified, that is used
  // irrespective of anything else.
  if (InlineThreshold.getNumOccurrences() > 0)
    Params.DefaultThreshold = InlineThreshold;
  else
    Params.DefaultThreshold = Threshold;

  // Set the HintThreshold knob from the -inlinehint-threshold.
  Params.HintThreshold = HintThreshold;

  // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
  Params.HotCallSiteThreshold = HotCallSiteThreshold;

  // If the -locally-hot-callsite-threshold is explicitly specified, use it to
  // populate LocallyHotCallSiteThreshold. Later, we populate
  // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold
  // if we know that the optimization level is O3 (in the getInlineParams
  // variant that takes the opt and size levels).
  // FIXME: Remove this check (and make the assignment unconditional) after
  // addressing size regression issues at O2.
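  // For example (hypothetical invocation): with
  // -locally-hot-callsite-threshold=600 on the command line, the knob below
  // is populated at any optimization level; without the flag it is only
  // populated at O3, in the getInlineParams overload further down.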
  if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;

  // Set the ColdCallSiteThreshold knob from the
  // -inline-cold-callsite-threshold.
  Params.ColdCallSiteThreshold = ColdCallSiteThreshold;

  // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
  // -inline-threshold commandline option is not explicitly given. If that
  // option is present, then its value applies even for callees with size and
  // minsize attributes.
  // If the -inline-threshold is not specified, set the ColdThreshold from the
  // -inlinecold-threshold even if it is not explicitly passed. If
  // -inline-threshold is specified, then -inlinecold-threshold needs to be
  // explicitly specified to set the ColdThreshold knob.
  if (InlineThreshold.getNumOccurrences() == 0) {
    Params.OptMinSizeThreshold = InlineConstants::OptMinSizeThreshold;
    Params.OptSizeThreshold = InlineConstants::OptSizeThreshold;
    Params.ColdThreshold = ColdThreshold;
  } else if (ColdThreshold.getNumOccurrences() > 0) {
    Params.ColdThreshold = ColdThreshold;
  }
  return Params;
}

InlineParams llvm::getInlineParams() {
  return getInlineParams(InlineThreshold);
}

// Compute the default threshold for inlining based on the opt level and the
// size opt level.
static int computeThresholdFromOptLevels(unsigned OptLevel,
                                         unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return InlineConstants::OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return InlineConstants::OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return InlineConstants::OptMinSizeThreshold;
  return InlineThreshold;
}

InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
  auto Params =
      getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
  // At O3, use the value of the -locally-hot-callsite-threshold option to
  // populate Params.LocallyHotCallSiteThreshold. Below O3, this flag only has
  // an effect when it is specified explicitly.
  if (OptLevel > 2)
    Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
  return Params;
}
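
// Worked example of the threshold selection above (illustrative only; the
// concrete numbers come from InlineConstants and the flags' defaults):
//   getInlineParams(3, 0) -> DefaultThreshold = OptAggressiveThreshold, and
//                            LocallyHotCallSiteThreshold is populated.
//   getInlineParams(2, 1) -> DefaultThreshold = OptSizeThreshold (-Os).
//   getInlineParams(2, 2) -> DefaultThreshold = OptMinSizeThreshold (-Oz).
// In every case an explicit -inline-threshold=N overrides the derived
// DefaultThreshold.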