//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

namespace {

/// \brief Cost analyzer for a single candidate call site.
///
/// Walks the callee's instructions via InstVisitor, simulating the
/// simplifications that inlining through this particular call site would
/// enable, and accumulates an estimated Cost that analyzeCall() compares
/// against Threshold. Each visit method returns true when the visited
/// instruction is expected to be simplified away (i.e. free) post-inlining.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  // Inlining is deemed profitable while Cost stays below Threshold.
  int Threshold;
  int Cost;

  // Properties discovered while walking the callee. Several of these veto
  // inlining outright regardless of the accumulated cost (recursion,
  // returns_twice exposure, dynamic allocas, indirectbr, localescape).
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  // NOTE: the init list order must continue to match the member declaration
  // order above (including the public stats below) to avoid -Wreorder.
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
300 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { 301 const DataLayout &DL = F.getParent()->getDataLayout(); 302 unsigned IntPtrWidth = DL.getPointerSizeInBits(); 303 assert(IntPtrWidth == Offset.getBitWidth()); 304 305 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 306 GTI != GTE; ++GTI) { 307 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 308 if (!OpC) 309 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand())) 310 OpC = dyn_cast<ConstantInt>(SimpleOp); 311 if (!OpC) 312 return false; 313 if (OpC->isZero()) 314 continue; 315 316 // Handle a struct index, which adds its field offset to the pointer. 317 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 318 unsigned ElementIdx = OpC->getZExtValue(); 319 const StructLayout *SL = DL.getStructLayout(STy); 320 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); 321 continue; 322 } 323 324 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); 325 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; 326 } 327 return true; 328 } 329 330 bool CallAnalyzer::visitAlloca(AllocaInst &I) { 331 // Check whether inlining will turn a dynamic alloca into a static 332 // alloca and handle that case. 333 if (I.isArrayAllocation()) { 334 Constant *Size = SimplifiedValues.lookup(I.getArraySize()); 335 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) { 336 Type *Ty = I.getAllocatedType(); 337 // FIXME: This can't be right. AllocatedSize is in *bytes*. 338 AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue(); 339 return Base::visitAlloca(I); 340 } 341 } 342 343 // Accumulate the allocated size. 344 if (I.isStaticAlloca()) { 345 const DataLayout &DL = F.getParent()->getDataLayout(); 346 Type *Ty = I.getAllocatedType(); 347 AllocatedSize += DL.getTypeAllocSize(Ty); 348 } 349 350 // We will happily inline static alloca instructions. 
351 if (I.isStaticAlloca()) 352 return Base::visitAlloca(I); 353 354 // FIXME: This is overly conservative. Dynamic allocas are inefficient for 355 // a variety of reasons, and so we would like to not inline them into 356 // functions which don't currently have a dynamic alloca. This simply 357 // disables inlining altogether in the presence of a dynamic alloca. 358 HasDynamicAlloca = true; 359 return false; 360 } 361 362 bool CallAnalyzer::visitPHI(PHINode &I) { 363 // FIXME: We should potentially be tracking values through phi nodes, 364 // especially when they collapse to a single value due to deleted CFG edges 365 // during inlining. 366 367 // FIXME: We need to propagate SROA *disabling* through phi nodes, even 368 // though we don't want to propagate it's bonuses. The idea is to disable 369 // SROA if it *might* be used in an inappropriate manner. 370 371 // Phi nodes are always zero-cost. 372 return true; 373 } 374 375 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { 376 Value *SROAArg; 377 DenseMap<Value *, int>::iterator CostIt; 378 bool SROACandidate = 379 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); 380 381 // Try to fold GEPs of constant-offset call site argument pointers. This 382 // requires target data and inbounds GEPs. 383 if (I.isInBounds()) { 384 // Check if we have a base + offset for the pointer. 385 Value *Ptr = I.getPointerOperand(); 386 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr); 387 if (BaseAndOffset.first) { 388 // Check if the offset of this GEP is constant, and if so accumulate it 389 // into Offset. 390 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) { 391 // Non-constant GEPs aren't folded, and disable SROA. 392 if (SROACandidate) 393 disableSROA(CostIt); 394 return false; 395 } 396 397 // Add the result as a new mapping to Base + Offset. 
398 ConstantOffsetPtrs[&I] = BaseAndOffset; 399 400 // Also handle SROA candidates here, we already know that the GEP is 401 // all-constant indexed. 402 if (SROACandidate) 403 SROAArgValues[&I] = SROAArg; 404 405 return true; 406 } 407 } 408 409 if (isGEPOffsetConstant(I)) { 410 if (SROACandidate) 411 SROAArgValues[&I] = SROAArg; 412 413 // Constant GEPs are modeled as free. 414 return true; 415 } 416 417 // Variable GEPs will require math and will disable SROA. 418 if (SROACandidate) 419 disableSROA(CostIt); 420 return false; 421 } 422 423 bool CallAnalyzer::visitBitCast(BitCastInst &I) { 424 // Propagate constants through bitcasts. 425 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 426 if (!COp) 427 COp = SimplifiedValues.lookup(I.getOperand(0)); 428 if (COp) 429 if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) { 430 SimplifiedValues[&I] = C; 431 return true; 432 } 433 434 // Track base/offsets through casts 435 std::pair<Value *, APInt> BaseAndOffset = 436 ConstantOffsetPtrs.lookup(I.getOperand(0)); 437 // Casts don't change the offset, just wrap it up. 438 if (BaseAndOffset.first) 439 ConstantOffsetPtrs[&I] = BaseAndOffset; 440 441 // Also look for SROA candidates here. 442 Value *SROAArg; 443 DenseMap<Value *, int>::iterator CostIt; 444 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) 445 SROAArgValues[&I] = SROAArg; 446 447 // Bitcasts are always zero cost. 448 return true; 449 } 450 451 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { 452 // Propagate constants through ptrtoint. 453 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 454 if (!COp) 455 COp = SimplifiedValues.lookup(I.getOperand(0)); 456 if (COp) 457 if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) { 458 SimplifiedValues[&I] = C; 459 return true; 460 } 461 462 // Track base/offset pairs when converted to a plain integer provided the 463 // integer is large enough to represent the pointer. 
464 unsigned IntegerSize = I.getType()->getScalarSizeInBits(); 465 const DataLayout &DL = F.getParent()->getDataLayout(); 466 if (IntegerSize >= DL.getPointerSizeInBits()) { 467 std::pair<Value *, APInt> BaseAndOffset = 468 ConstantOffsetPtrs.lookup(I.getOperand(0)); 469 if (BaseAndOffset.first) 470 ConstantOffsetPtrs[&I] = BaseAndOffset; 471 } 472 473 // This is really weird. Technically, ptrtoint will disable SROA. However, 474 // unless that ptrtoint is *used* somewhere in the live basic blocks after 475 // inlining, it will be nuked, and SROA should proceed. All of the uses which 476 // would block SROA would also block SROA if applied directly to a pointer, 477 // and so we can just add the integer in here. The only places where SROA is 478 // preserved either cannot fire on an integer, or won't in-and-of themselves 479 // disable SROA (ext) w/o some later use that we would see and disable. 480 Value *SROAArg; 481 DenseMap<Value *, int>::iterator CostIt; 482 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) 483 SROAArgValues[&I] = SROAArg; 484 485 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 486 } 487 488 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { 489 // Propagate constants through ptrtoint. 490 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 491 if (!COp) 492 COp = SimplifiedValues.lookup(I.getOperand(0)); 493 if (COp) 494 if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) { 495 SimplifiedValues[&I] = C; 496 return true; 497 } 498 499 // Track base/offset pairs when round-tripped through a pointer without 500 // modifications provided the integer is not too large. 
501 Value *Op = I.getOperand(0); 502 unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); 503 const DataLayout &DL = F.getParent()->getDataLayout(); 504 if (IntegerSize <= DL.getPointerSizeInBits()) { 505 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op); 506 if (BaseAndOffset.first) 507 ConstantOffsetPtrs[&I] = BaseAndOffset; 508 } 509 510 // "Propagate" SROA here in the same manner as we do for ptrtoint above. 511 Value *SROAArg; 512 DenseMap<Value *, int>::iterator CostIt; 513 if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) 514 SROAArgValues[&I] = SROAArg; 515 516 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 517 } 518 519 bool CallAnalyzer::visitCastInst(CastInst &I) { 520 // Propagate constants through ptrtoint. 521 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 522 if (!COp) 523 COp = SimplifiedValues.lookup(I.getOperand(0)); 524 if (COp) 525 if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { 526 SimplifiedValues[&I] = C; 527 return true; 528 } 529 530 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. 531 disableSROA(I.getOperand(0)); 532 533 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 534 } 535 536 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { 537 Value *Operand = I.getOperand(0); 538 Constant *COp = dyn_cast<Constant>(Operand); 539 if (!COp) 540 COp = SimplifiedValues.lookup(Operand); 541 if (COp) { 542 const DataLayout &DL = F.getParent()->getDataLayout(); 543 if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) { 544 SimplifiedValues[&I] = C; 545 return true; 546 } 547 } 548 549 // Disable any SROA on the argument to arbitrary unary operators. 
550 disableSROA(Operand); 551 552 return false; 553 } 554 555 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { 556 unsigned ArgNo = A->getArgNo(); 557 return CandidateCS.paramHasAttr(ArgNo + 1, Attr); 558 } 559 560 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { 561 // Does the *call site* have the NonNull attribute set on an argument? We 562 // use the attribute on the call site to memoize any analysis done in the 563 // caller. This will also trip if the callee function has a non-null 564 // parameter attribute, but that's a less interesting case because hopefully 565 // the callee would already have been simplified based on that. 566 if (Argument *A = dyn_cast<Argument>(V)) 567 if (paramHasAttr(A, Attribute::NonNull)) 568 return true; 569 570 // Is this an alloca in the caller? This is distinct from the attribute case 571 // above because attributes aren't updated within the inliner itself and we 572 // always want to catch the alloca derived case. 573 if (isAllocaDerivedArg(V)) 574 // We can actually predict the result of comparisons between an 575 // alloca-derived value and null. Note that this fires regardless of 576 // SROA firing. 577 return true; 578 579 return false; 580 } 581 582 bool CallAnalyzer::allowSizeGrowth(CallSite CS) { 583 // If the normal destination of the invoke or the parent block of the call 584 // site is unreachable-terminated, there is little point in inlining this 585 // unless there is literally zero cost. 586 // FIXME: Note that it is possible that an unreachable-terminated block has a 587 // hot entry. For example, in below scenario inlining hot_call_X() may be 588 // beneficial : 589 // main() { 590 // hot_call_1(); 591 // ... 592 // hot_call_N() 593 // exit(0); 594 // } 595 // For now, we are not handling this corner case here as it is rare in real 596 // code. In future, we should elaborate this based on BPI and BFI in more 597 // general threshold adjusting heuristics in updateThreshold(). 
598 Instruction *Instr = CS.getInstruction(); 599 if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) { 600 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator())) 601 return false; 602 } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator())) 603 return false; 604 605 return true; 606 } 607 608 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { 609 // If no size growth is allowed for this inlining, set Threshold to 0. 610 if (!allowSizeGrowth(CS)) { 611 Threshold = 0; 612 return; 613 } 614 615 Function *Caller = CS.getCaller(); 616 if (DefaultInlineThreshold.getNumOccurrences() > 0) { 617 // Explicitly specified -inline-threhold overrides the threshold passed to 618 // CallAnalyzer's constructor. 619 Threshold = DefaultInlineThreshold; 620 } else { 621 // If -inline-threshold is not given, listen to the optsize and minsize 622 // attributes when they would decrease the threshold. 623 if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold) 624 Threshold = OptMinSizeThreshold; 625 else if (Caller->optForSize() && OptSizeThreshold < Threshold) 626 Threshold = OptSizeThreshold; 627 } 628 629 // If profile information is available, use that to adjust threshold of hot 630 // and cold functions. 631 // FIXME: The heuristic used below for determining hotness and coldness are 632 // based on preliminary SPEC tuning and may not be optimal. Replace this with 633 // a well-tuned heuristic based on *callsite* hotness and not callee hotness. 
634 uint64_t FunctionCount = 0, MaxFunctionCount = 0; 635 bool HasPGOCounts = false; 636 if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) { 637 HasPGOCounts = true; 638 FunctionCount = Callee.getEntryCount().getValue(); 639 MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue(); 640 } 641 642 // Listen to the inlinehint attribute or profile based hotness information 643 // when it would increase the threshold and the caller does not need to 644 // minimize its size. 645 bool InlineHint = 646 Callee.hasFnAttribute(Attribute::InlineHint) || 647 (HasPGOCounts && 648 FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount)); 649 if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize()) 650 Threshold = HintThreshold; 651 652 // Listen to the cold attribute or profile based coldness information 653 // when it would decrease the threshold. 654 bool ColdCallee = 655 Callee.hasFnAttribute(Attribute::Cold) || 656 (HasPGOCounts && 657 FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount)); 658 // Command line argument for DefaultInlineThreshold will override the default 659 // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, 660 // do not use the default cold threshold even if it is smaller. 661 if ((DefaultInlineThreshold.getNumOccurrences() == 0 || 662 ColdThreshold.getNumOccurrences() > 0) && 663 ColdCallee && ColdThreshold < Threshold) 664 Threshold = ColdThreshold; 665 666 // Finally, take the target-specific inlining threshold multiplier into 667 // account. 668 Threshold *= TTI.getInliningThresholdMultiplier(); 669 } 670 671 bool CallAnalyzer::visitCmpInst(CmpInst &I) { 672 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 673 // First try to handle simplified comparisons. 
674 if (!isa<Constant>(LHS)) 675 if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) 676 LHS = SimpleLHS; 677 if (!isa<Constant>(RHS)) 678 if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) 679 RHS = SimpleRHS; 680 if (Constant *CLHS = dyn_cast<Constant>(LHS)) { 681 if (Constant *CRHS = dyn_cast<Constant>(RHS)) 682 if (Constant *C = 683 ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { 684 SimplifiedValues[&I] = C; 685 return true; 686 } 687 } 688 689 if (I.getOpcode() == Instruction::FCmp) 690 return false; 691 692 // Otherwise look for a comparison between constant offset pointers with 693 // a common base. 694 Value *LHSBase, *RHSBase; 695 APInt LHSOffset, RHSOffset; 696 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 697 if (LHSBase) { 698 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 699 if (RHSBase && LHSBase == RHSBase) { 700 // We have common bases, fold the icmp to a constant based on the 701 // offsets. 702 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 703 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 704 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) { 705 SimplifiedValues[&I] = C; 706 ++NumConstantPtrCmps; 707 return true; 708 } 709 } 710 } 711 712 // If the comparison is an equality comparison with null, we can simplify it 713 // if we know the value (argument) can't be null 714 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) && 715 isKnownNonNullInCallee(I.getOperand(0))) { 716 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; 717 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType()) 718 : ConstantInt::getFalse(I.getType()); 719 return true; 720 } 721 // Finally check for SROA candidates in comparisons. 
722 Value *SROAArg; 723 DenseMap<Value *, int>::iterator CostIt; 724 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) { 725 if (isa<ConstantPointerNull>(I.getOperand(1))) { 726 accumulateSROACost(CostIt, InlineConstants::InstrCost); 727 return true; 728 } 729 730 disableSROA(CostIt); 731 } 732 733 return false; 734 } 735 736 bool CallAnalyzer::visitSub(BinaryOperator &I) { 737 // Try to handle a special case: we can fold computing the difference of two 738 // constant-related pointers. 739 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 740 Value *LHSBase, *RHSBase; 741 APInt LHSOffset, RHSOffset; 742 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 743 if (LHSBase) { 744 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 745 if (RHSBase && LHSBase == RHSBase) { 746 // We have common bases, fold the subtract to a constant based on the 747 // offsets. 748 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 749 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 750 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) { 751 SimplifiedValues[&I] = C; 752 ++NumConstantPtrDiffs; 753 return true; 754 } 755 } 756 } 757 758 // Otherwise, fall back to the generic logic for simplifying and handling 759 // instructions. 
760 return Base::visitSub(I); 761 } 762 763 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { 764 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 765 const DataLayout &DL = F.getParent()->getDataLayout(); 766 if (!isa<Constant>(LHS)) 767 if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) 768 LHS = SimpleLHS; 769 if (!isa<Constant>(RHS)) 770 if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) 771 RHS = SimpleRHS; 772 Value *SimpleV = nullptr; 773 if (auto FI = dyn_cast<FPMathOperator>(&I)) 774 SimpleV = 775 SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); 776 else 777 SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); 778 779 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) { 780 SimplifiedValues[&I] = C; 781 return true; 782 } 783 784 // Disable any SROA on arguments to arbitrary, unsimplified binary operators. 785 disableSROA(LHS); 786 disableSROA(RHS); 787 788 return false; 789 } 790 791 bool CallAnalyzer::visitLoad(LoadInst &I) { 792 Value *SROAArg; 793 DenseMap<Value *, int>::iterator CostIt; 794 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { 795 if (I.isSimple()) { 796 accumulateSROACost(CostIt, InlineConstants::InstrCost); 797 return true; 798 } 799 800 disableSROA(CostIt); 801 } 802 803 return false; 804 } 805 806 bool CallAnalyzer::visitStore(StoreInst &I) { 807 Value *SROAArg; 808 DenseMap<Value *, int>::iterator CostIt; 809 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { 810 if (I.isSimple()) { 811 accumulateSROACost(CostIt, InlineConstants::InstrCost); 812 return true; 813 } 814 815 disableSROA(CostIt); 816 } 817 818 return false; 819 } 820 821 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { 822 // Constant folding for extract value is trivial. 
823 Constant *C = dyn_cast<Constant>(I.getAggregateOperand()); 824 if (!C) 825 C = SimplifiedValues.lookup(I.getAggregateOperand()); 826 if (C) { 827 SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices()); 828 return true; 829 } 830 831 // SROA can look through these but give them a cost. 832 return false; 833 } 834 835 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { 836 // Constant folding for insert value is trivial. 837 Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand()); 838 if (!AggC) 839 AggC = SimplifiedValues.lookup(I.getAggregateOperand()); 840 Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand()); 841 if (!InsertedC) 842 InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand()); 843 if (AggC && InsertedC) { 844 SimplifiedValues[&I] = 845 ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices()); 846 return true; 847 } 848 849 // SROA can look through these but give them a cost. 850 return false; 851 } 852 853 /// \brief Try to simplify a call site. 854 /// 855 /// Takes a concrete function and callsite and tries to actually simplify it by 856 /// analyzing the arguments and call itself with instsimplify. Returns true if 857 /// it has simplified the callsite to some other entity (a constant), making it 858 /// free. 859 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) { 860 // FIXME: Using the instsimplify logic directly for this is inefficient 861 // because we have to continually rebuild the argument list even when no 862 // simplifications can be performed. Until that is fixed with remapping 863 // inside of instsimplify, directly constant fold calls here. 864 if (!canConstantFoldCallTo(F)) 865 return false; 866 867 // Try to re-map the arguments to constants. 
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  // All arguments are constants; if the call itself folds, record the folded
  // value so users of the call can simplify in turn, and treat it as free.
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

/// Account for the cost of a call site. Returns true when the call has been
/// simplified away (and is therefore free); returns false when its cost has
/// been charged to \c Cost or an abort flag (ExposesReturnsTwice,
/// IsRecursiveCall, HasFrameEscape) has been set.
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  // Record noduplicate calls; they constrain inlining elsewhere in the
  // analysis (the call may be moved but never duplicated).
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

/// Fallback visitor for every instruction kind without a dedicated handler
/// above. Free instructions (per TTI) cost nothing; anything else charges the
/// base instruction cost and poisons SROA for its operands.
bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      // An interposable alias may resolve to a different definition at link
      // time, so stop rather than look through it.
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  // A declaration-only callee has no body to analyze; treat it as viable.
  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
1418 LLVM_DUMP_METHOD void CallAnalyzer::dump() { 1419 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" 1420 DEBUG_PRINT_STAT(NumConstantArgs); 1421 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); 1422 DEBUG_PRINT_STAT(NumAllocaArgs); 1423 DEBUG_PRINT_STAT(NumConstantPtrCmps); 1424 DEBUG_PRINT_STAT(NumConstantPtrDiffs); 1425 DEBUG_PRINT_STAT(NumInstructionsSimplified); 1426 DEBUG_PRINT_STAT(NumInstructions); 1427 DEBUG_PRINT_STAT(SROACostSavings); 1428 DEBUG_PRINT_STAT(SROACostSavingsLost); 1429 DEBUG_PRINT_STAT(ContainsNoDuplicateCall); 1430 DEBUG_PRINT_STAT(Cost); 1431 DEBUG_PRINT_STAT(Threshold); 1432 #undef DEBUG_PRINT_STAT 1433 } 1434 #endif 1435 1436 /// \brief Test that two functions either have or have not the given attribute 1437 /// at the same time. 1438 template <typename AttrKind> 1439 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) { 1440 return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr); 1441 } 1442 1443 /// \brief Test that there are no attribute conflicts between Caller and Callee 1444 /// that prevent inlining. 
1445 static bool functionsHaveCompatibleAttributes(Function *Caller, 1446 Function *Callee, 1447 TargetTransformInfo &TTI) { 1448 return TTI.areInlineCompatible(Caller, Callee) && 1449 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 1450 } 1451 1452 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold, 1453 TargetTransformInfo &CalleeTTI, 1454 AssumptionCacheTracker *ACT) { 1455 return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI, 1456 ACT); 1457 } 1458 1459 int llvm::computeThresholdFromOptLevels(unsigned OptLevel, 1460 unsigned SizeOptLevel) { 1461 if (OptLevel > 2) 1462 return OptAggressiveThreshold; 1463 if (SizeOptLevel == 1) // -Os 1464 return OptSizeThreshold; 1465 if (SizeOptLevel == 2) // -Oz 1466 return OptMinSizeThreshold; 1467 return DefaultInlineThreshold; 1468 } 1469 1470 int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; } 1471 1472 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, 1473 int DefaultThreshold, 1474 TargetTransformInfo &CalleeTTI, 1475 AssumptionCacheTracker *ACT) { 1476 1477 // Cannot inline indirect calls. 1478 if (!Callee) 1479 return llvm::InlineCost::getNever(); 1480 1481 // Calls to functions with always-inline attributes should be inlined 1482 // whenever possible. 1483 if (CS.hasFnAttr(Attribute::AlwaysInline)) { 1484 if (isInlineViable(*Callee)) 1485 return llvm::InlineCost::getAlways(); 1486 return llvm::InlineCost::getNever(); 1487 } 1488 1489 // Never inline functions with conflicting attributes (unless callee has 1490 // always-inline attribute). 1491 if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI)) 1492 return llvm::InlineCost::getNever(); 1493 1494 // Don't inline this call if the caller has the optnone attribute. 1495 if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone)) 1496 return llvm::InlineCost::getNever(); 1497 1498 // Don't inline functions which can be interposed at link-time. 
Don't inline 1499 // functions marked noinline or call sites marked noinline. 1500 // Note: inlining non-exact non-interposable fucntions is fine, since we know 1501 // we have *a* correct implementation of the source level function. 1502 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) || 1503 CS.isNoInline()) 1504 return llvm::InlineCost::getNever(); 1505 1506 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 1507 << "...\n"); 1508 1509 CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS); 1510 bool ShouldInline = CA.analyzeCall(CS); 1511 1512 DEBUG(CA.dump()); 1513 1514 // Check if there was a reason to force inlining or no inlining. 1515 if (!ShouldInline && CA.getCost() < CA.getThreshold()) 1516 return InlineCost::getNever(); 1517 if (ShouldInline && CA.getCost() >= CA.getThreshold()) 1518 return InlineCost::getAlways(); 1519 1520 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 1521 } 1522 1523 bool llvm::isInlineViable(Function &F) { 1524 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 1525 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { 1526 // Disallow inlining of functions which contain indirect branches or 1527 // blockaddresses. 1528 if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken()) 1529 return false; 1530 1531 for (auto &II : *BI) { 1532 CallSite CS(&II); 1533 if (!CS) 1534 continue; 1535 1536 // Disallow recursive calls. 1537 if (&F == CS.getCalledFunction()) 1538 return false; 1539 1540 // Disallow calls which expose returns-twice to a function not previously 1541 // attributed as such. 1542 if (!ReturnsTwice && CS.isCall() && 1543 cast<CallInst>(CS.getInstruction())->canReturnTwice()) 1544 return false; 1545 1546 // Disallow inlining functions that call @llvm.localescape. Doing this 1547 // correctly would require major changes to the inliner. 
1548 if (CS.getCalledFunction() && 1549 CS.getCalledFunction()->getIntrinsicID() == 1550 llvm::Intrinsic::localescape) 1551 return false; 1552 } 1553 } 1554 1555 return true; 1556 } 1557