//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

namespace {

/// \brief Analyzer that walks a candidate callee and accumulates an inline
/// cost, simplifying instructions against the concrete call-site arguments
/// as it goes. One instance analyzes exactly one (callee, call site) pair.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  // Remaining cost budget for this call site; analysis-time adjustments are
  // made by updateThreshold().
  int Threshold;
  // Accumulated cost of the instructions analyzed so far.
  int Cost;

  // Flags that record properties which abort or penalize the analysis.
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *);
  void visit(Module &);
  void visit(Function *);
  void visit(Function &);
  void visit(BasicBlock *);
  void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  // Note: Arg is only written when we return true; a candidate may already
  // have had SROA disabled (erased from SROAArgCosts), in which case we
  // return false here.
  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
300 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { 301 const DataLayout &DL = F.getParent()->getDataLayout(); 302 unsigned IntPtrWidth = DL.getPointerSizeInBits(); 303 assert(IntPtrWidth == Offset.getBitWidth()); 304 305 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 306 GTI != GTE; ++GTI) { 307 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 308 if (!OpC) 309 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand())) 310 OpC = dyn_cast<ConstantInt>(SimpleOp); 311 if (!OpC) 312 return false; 313 if (OpC->isZero()) 314 continue; 315 316 // Handle a struct index, which adds its field offset to the pointer. 317 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 318 unsigned ElementIdx = OpC->getZExtValue(); 319 const StructLayout *SL = DL.getStructLayout(STy); 320 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); 321 continue; 322 } 323 324 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); 325 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; 326 } 327 return true; 328 } 329 330 bool CallAnalyzer::visitAlloca(AllocaInst &I) { 331 // Check whether inlining will turn a dynamic alloca into a static 332 // alloca, and handle that case. 333 if (I.isArrayAllocation()) { 334 if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) { 335 ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size); 336 assert(AllocSize && "Allocation size not a constant int?"); 337 Type *Ty = I.getAllocatedType(); 338 AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue(); 339 return Base::visitAlloca(I); 340 } 341 } 342 343 // Accumulate the allocated size. 344 if (I.isStaticAlloca()) { 345 const DataLayout &DL = F.getParent()->getDataLayout(); 346 Type *Ty = I.getAllocatedType(); 347 AllocatedSize += DL.getTypeAllocSize(Ty); 348 } 349 350 // We will happily inline static alloca instructions. 
351 if (I.isStaticAlloca()) 352 return Base::visitAlloca(I); 353 354 // FIXME: This is overly conservative. Dynamic allocas are inefficient for 355 // a variety of reasons, and so we would like to not inline them into 356 // functions which don't currently have a dynamic alloca. This simply 357 // disables inlining altogether in the presence of a dynamic alloca. 358 HasDynamicAlloca = true; 359 return false; 360 } 361 362 bool CallAnalyzer::visitPHI(PHINode &I) { 363 // FIXME: We should potentially be tracking values through phi nodes, 364 // especially when they collapse to a single value due to deleted CFG edges 365 // during inlining. 366 367 // FIXME: We need to propagate SROA *disabling* through phi nodes, even 368 // though we don't want to propagate it's bonuses. The idea is to disable 369 // SROA if it *might* be used in an inappropriate manner. 370 371 // Phi nodes are always zero-cost. 372 return true; 373 } 374 375 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { 376 Value *SROAArg; 377 DenseMap<Value *, int>::iterator CostIt; 378 bool SROACandidate = 379 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); 380 381 // Try to fold GEPs of constant-offset call site argument pointers. This 382 // requires target data and inbounds GEPs. 383 if (I.isInBounds()) { 384 // Check if we have a base + offset for the pointer. 385 Value *Ptr = I.getPointerOperand(); 386 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr); 387 if (BaseAndOffset.first) { 388 // Check if the offset of this GEP is constant, and if so accumulate it 389 // into Offset. 390 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) { 391 // Non-constant GEPs aren't folded, and disable SROA. 392 if (SROACandidate) 393 disableSROA(CostIt); 394 return false; 395 } 396 397 // Add the result as a new mapping to Base + Offset. 
398 ConstantOffsetPtrs[&I] = BaseAndOffset; 399 400 // Also handle SROA candidates here, we already know that the GEP is 401 // all-constant indexed. 402 if (SROACandidate) 403 SROAArgValues[&I] = SROAArg; 404 405 return true; 406 } 407 } 408 409 if (isGEPOffsetConstant(I)) { 410 if (SROACandidate) 411 SROAArgValues[&I] = SROAArg; 412 413 // Constant GEPs are modeled as free. 414 return true; 415 } 416 417 // Variable GEPs will require math and will disable SROA. 418 if (SROACandidate) 419 disableSROA(CostIt); 420 return false; 421 } 422 423 bool CallAnalyzer::visitBitCast(BitCastInst &I) { 424 // Propagate constants through bitcasts. 425 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 426 if (!COp) 427 COp = SimplifiedValues.lookup(I.getOperand(0)); 428 if (COp) 429 if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) { 430 SimplifiedValues[&I] = C; 431 return true; 432 } 433 434 // Track base/offsets through casts 435 std::pair<Value *, APInt> BaseAndOffset = 436 ConstantOffsetPtrs.lookup(I.getOperand(0)); 437 // Casts don't change the offset, just wrap it up. 438 if (BaseAndOffset.first) 439 ConstantOffsetPtrs[&I] = BaseAndOffset; 440 441 // Also look for SROA candidates here. 442 Value *SROAArg; 443 DenseMap<Value *, int>::iterator CostIt; 444 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) 445 SROAArgValues[&I] = SROAArg; 446 447 // Bitcasts are always zero cost. 448 return true; 449 } 450 451 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { 452 // Propagate constants through ptrtoint. 453 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 454 if (!COp) 455 COp = SimplifiedValues.lookup(I.getOperand(0)); 456 if (COp) 457 if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) { 458 SimplifiedValues[&I] = C; 459 return true; 460 } 461 462 // Track base/offset pairs when converted to a plain integer provided the 463 // integer is large enough to represent the pointer. 
464 unsigned IntegerSize = I.getType()->getScalarSizeInBits(); 465 const DataLayout &DL = F.getParent()->getDataLayout(); 466 if (IntegerSize >= DL.getPointerSizeInBits()) { 467 std::pair<Value *, APInt> BaseAndOffset = 468 ConstantOffsetPtrs.lookup(I.getOperand(0)); 469 if (BaseAndOffset.first) 470 ConstantOffsetPtrs[&I] = BaseAndOffset; 471 } 472 473 // This is really weird. Technically, ptrtoint will disable SROA. However, 474 // unless that ptrtoint is *used* somewhere in the live basic blocks after 475 // inlining, it will be nuked, and SROA should proceed. All of the uses which 476 // would block SROA would also block SROA if applied directly to a pointer, 477 // and so we can just add the integer in here. The only places where SROA is 478 // preserved either cannot fire on an integer, or won't in-and-of themselves 479 // disable SROA (ext) w/o some later use that we would see and disable. 480 Value *SROAArg; 481 DenseMap<Value *, int>::iterator CostIt; 482 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) 483 SROAArgValues[&I] = SROAArg; 484 485 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 486 } 487 488 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { 489 // Propagate constants through ptrtoint. 490 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 491 if (!COp) 492 COp = SimplifiedValues.lookup(I.getOperand(0)); 493 if (COp) 494 if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) { 495 SimplifiedValues[&I] = C; 496 return true; 497 } 498 499 // Track base/offset pairs when round-tripped through a pointer without 500 // modifications provided the integer is not too large. 
501 Value *Op = I.getOperand(0); 502 unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); 503 const DataLayout &DL = F.getParent()->getDataLayout(); 504 if (IntegerSize <= DL.getPointerSizeInBits()) { 505 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op); 506 if (BaseAndOffset.first) 507 ConstantOffsetPtrs[&I] = BaseAndOffset; 508 } 509 510 // "Propagate" SROA here in the same manner as we do for ptrtoint above. 511 Value *SROAArg; 512 DenseMap<Value *, int>::iterator CostIt; 513 if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) 514 SROAArgValues[&I] = SROAArg; 515 516 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 517 } 518 519 bool CallAnalyzer::visitCastInst(CastInst &I) { 520 // Propagate constants through ptrtoint. 521 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); 522 if (!COp) 523 COp = SimplifiedValues.lookup(I.getOperand(0)); 524 if (COp) 525 if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { 526 SimplifiedValues[&I] = C; 527 return true; 528 } 529 530 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. 531 disableSROA(I.getOperand(0)); 532 533 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); 534 } 535 536 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { 537 Value *Operand = I.getOperand(0); 538 Constant *COp = dyn_cast<Constant>(Operand); 539 if (!COp) 540 COp = SimplifiedValues.lookup(Operand); 541 if (COp) { 542 const DataLayout &DL = F.getParent()->getDataLayout(); 543 if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) { 544 SimplifiedValues[&I] = C; 545 return true; 546 } 547 } 548 549 // Disable any SROA on the argument to arbitrary unary operators. 
550 disableSROA(Operand); 551 552 return false; 553 } 554 555 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { 556 unsigned ArgNo = A->getArgNo(); 557 return CandidateCS.paramHasAttr(ArgNo + 1, Attr); 558 } 559 560 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { 561 // Does the *call site* have the NonNull attribute set on an argument? We 562 // use the attribute on the call site to memoize any analysis done in the 563 // caller. This will also trip if the callee function has a non-null 564 // parameter attribute, but that's a less interesting case because hopefully 565 // the callee would already have been simplified based on that. 566 if (Argument *A = dyn_cast<Argument>(V)) 567 if (paramHasAttr(A, Attribute::NonNull)) 568 return true; 569 570 // Is this an alloca in the caller? This is distinct from the attribute case 571 // above because attributes aren't updated within the inliner itself and we 572 // always want to catch the alloca derived case. 573 if (isAllocaDerivedArg(V)) 574 // We can actually predict the result of comparisons between an 575 // alloca-derived value and null. Note that this fires regardless of 576 // SROA firing. 577 return true; 578 579 return false; 580 } 581 582 bool CallAnalyzer::allowSizeGrowth(CallSite CS) { 583 // If the normal destination of the invoke or the parent block of the call 584 // site is unreachable-terminated, there is little point in inlining this 585 // unless there is literally zero cost. 586 // FIXME: Note that it is possible that an unreachable-terminated block has a 587 // hot entry. For example, in below scenario inlining hot_call_X() may be 588 // beneficial : 589 // main() { 590 // hot_call_1(); 591 // ... 592 // hot_call_N() 593 // exit(0); 594 // } 595 // For now, we are not handling this corner case here as it is rare in real 596 // code. In future, we should elaborate this based on BPI and BFI in more 597 // general threshold adjusting heuristics in updateThreshold(). 
598 Instruction *Instr = CS.getInstruction(); 599 if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) { 600 if (isa<UnreachableInst>(II->getNormalDest()->getTerminator())) 601 return false; 602 } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator())) 603 return false; 604 605 return true; 606 } 607 608 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { 609 // If no size growth is allowed for this inlining, set Threshold to 0. 610 if (!allowSizeGrowth(CS)) { 611 Threshold = 0; 612 return; 613 } 614 615 // If -inline-threshold is not given, listen to the optsize and minsize 616 // attributes when they would decrease the threshold. 617 Function *Caller = CS.getCaller(); 618 619 if (!(DefaultInlineThreshold.getNumOccurrences() > 0)) { 620 if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold) 621 Threshold = OptMinSizeThreshold; 622 else if (Caller->optForSize() && OptSizeThreshold < Threshold) 623 Threshold = OptSizeThreshold; 624 } 625 626 // If profile information is available, use that to adjust threshold of hot 627 // and cold functions. 628 // FIXME: The heuristic used below for determining hotness and coldness are 629 // based on preliminary SPEC tuning and may not be optimal. Replace this with 630 // a well-tuned heuristic based on *callsite* hotness and not callee hotness. 631 uint64_t FunctionCount = 0, MaxFunctionCount = 0; 632 bool HasPGOCounts = false; 633 if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) { 634 HasPGOCounts = true; 635 FunctionCount = Callee.getEntryCount().getValue(); 636 MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue(); 637 } 638 639 // Listen to the inlinehint attribute or profile based hotness information 640 // when it would increase the threshold and the caller does not need to 641 // minimize its size. 
642 bool InlineHint = 643 Callee.hasFnAttribute(Attribute::InlineHint) || 644 (HasPGOCounts && 645 FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount)); 646 if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize()) 647 Threshold = HintThreshold; 648 649 // Listen to the cold attribute or profile based coldness information 650 // when it would decrease the threshold. 651 bool ColdCallee = 652 Callee.hasFnAttribute(Attribute::Cold) || 653 (HasPGOCounts && 654 FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount)); 655 // Command line argument for DefaultInlineThreshold will override the default 656 // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold, 657 // do not use the default cold threshold even if it is smaller. 658 if ((DefaultInlineThreshold.getNumOccurrences() == 0 || 659 ColdThreshold.getNumOccurrences() > 0) && 660 ColdCallee && ColdThreshold < Threshold) 661 Threshold = ColdThreshold; 662 663 // Finally, take the target-specific inlining threshold multiplier into 664 // account. 665 Threshold *= TTI.getInliningThresholdMultiplier(); 666 } 667 668 bool CallAnalyzer::visitCmpInst(CmpInst &I) { 669 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 670 // First try to handle simplified comparisons. 671 if (!isa<Constant>(LHS)) 672 if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) 673 LHS = SimpleLHS; 674 if (!isa<Constant>(RHS)) 675 if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) 676 RHS = SimpleRHS; 677 if (Constant *CLHS = dyn_cast<Constant>(LHS)) { 678 if (Constant *CRHS = dyn_cast<Constant>(RHS)) 679 if (Constant *C = 680 ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { 681 SimplifiedValues[&I] = C; 682 return true; 683 } 684 } 685 686 if (I.getOpcode() == Instruction::FCmp) 687 return false; 688 689 // Otherwise look for a comparison between constant offset pointers with 690 // a common base. 
691 Value *LHSBase, *RHSBase; 692 APInt LHSOffset, RHSOffset; 693 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 694 if (LHSBase) { 695 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 696 if (RHSBase && LHSBase == RHSBase) { 697 // We have common bases, fold the icmp to a constant based on the 698 // offsets. 699 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 700 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 701 if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) { 702 SimplifiedValues[&I] = C; 703 ++NumConstantPtrCmps; 704 return true; 705 } 706 } 707 } 708 709 // If the comparison is an equality comparison with null, we can simplify it 710 // if we know the value (argument) can't be null 711 if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) && 712 isKnownNonNullInCallee(I.getOperand(0))) { 713 bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE; 714 SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType()) 715 : ConstantInt::getFalse(I.getType()); 716 return true; 717 } 718 // Finally check for SROA candidates in comparisons. 719 Value *SROAArg; 720 DenseMap<Value *, int>::iterator CostIt; 721 if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) { 722 if (isa<ConstantPointerNull>(I.getOperand(1))) { 723 accumulateSROACost(CostIt, InlineConstants::InstrCost); 724 return true; 725 } 726 727 disableSROA(CostIt); 728 } 729 730 return false; 731 } 732 733 bool CallAnalyzer::visitSub(BinaryOperator &I) { 734 // Try to handle a special case: we can fold computing the difference of two 735 // constant-related pointers. 
736 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 737 Value *LHSBase, *RHSBase; 738 APInt LHSOffset, RHSOffset; 739 std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS); 740 if (LHSBase) { 741 std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS); 742 if (RHSBase && LHSBase == RHSBase) { 743 // We have common bases, fold the subtract to a constant based on the 744 // offsets. 745 Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset); 746 Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset); 747 if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) { 748 SimplifiedValues[&I] = C; 749 ++NumConstantPtrDiffs; 750 return true; 751 } 752 } 753 } 754 755 // Otherwise, fall back to the generic logic for simplifying and handling 756 // instructions. 757 return Base::visitSub(I); 758 } 759 760 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { 761 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); 762 const DataLayout &DL = F.getParent()->getDataLayout(); 763 if (!isa<Constant>(LHS)) 764 if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) 765 LHS = SimpleLHS; 766 if (!isa<Constant>(RHS)) 767 if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) 768 RHS = SimpleRHS; 769 Value *SimpleV = nullptr; 770 if (auto FI = dyn_cast<FPMathOperator>(&I)) 771 SimpleV = 772 SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); 773 else 774 SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); 775 776 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) { 777 SimplifiedValues[&I] = C; 778 return true; 779 } 780 781 // Disable any SROA on arguments to arbitrary, unsimplified binary operators. 
782 disableSROA(LHS); 783 disableSROA(RHS); 784 785 return false; 786 } 787 788 bool CallAnalyzer::visitLoad(LoadInst &I) { 789 Value *SROAArg; 790 DenseMap<Value *, int>::iterator CostIt; 791 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { 792 if (I.isSimple()) { 793 accumulateSROACost(CostIt, InlineConstants::InstrCost); 794 return true; 795 } 796 797 disableSROA(CostIt); 798 } 799 800 return false; 801 } 802 803 bool CallAnalyzer::visitStore(StoreInst &I) { 804 Value *SROAArg; 805 DenseMap<Value *, int>::iterator CostIt; 806 if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { 807 if (I.isSimple()) { 808 accumulateSROACost(CostIt, InlineConstants::InstrCost); 809 return true; 810 } 811 812 disableSROA(CostIt); 813 } 814 815 return false; 816 } 817 818 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { 819 // Constant folding for extract value is trivial. 820 Constant *C = dyn_cast<Constant>(I.getAggregateOperand()); 821 if (!C) 822 C = SimplifiedValues.lookup(I.getAggregateOperand()); 823 if (C) { 824 SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices()); 825 return true; 826 } 827 828 // SROA can look through these but give them a cost. 829 return false; 830 } 831 832 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { 833 // Constant folding for insert value is trivial. 834 Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand()); 835 if (!AggC) 836 AggC = SimplifiedValues.lookup(I.getAggregateOperand()); 837 Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand()); 838 if (!InsertedC) 839 InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand()); 840 if (AggC && InsertedC) { 841 SimplifiedValues[&I] = 842 ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices()); 843 return true; 844 } 845 846 // SROA can look through these but give them a cost. 847 return false; 848 } 849 850 /// \brief Try to simplify a call site. 
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
       ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

// Model the cost of a call site: constant-fold it when possible, special-case
// known intrinsics, flag recursion/returns-twice/localescape, charge argument
// setup and call overhead, and give devirtualized indirect calls a capped
// bonus by speculatively analyzing the discovered target.
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::load_relative:
        // This is normally lowered to 4 LLVM instructions.
        Cost += 3 * InlineConstants::InstrCost;
        return false;

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

// Fallback for any instruction without a dedicated visitor: target-free
// instructions cost nothing; everything else is opaque and poisons SROA for
// its operands.
bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      // An interposable alias may resolve to a different definition at link
      // time, so we cannot look through it.
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage =
      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F),
                                      EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16>>
      BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond =
                dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond =
              dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
1415 LLVM_DUMP_METHOD void CallAnalyzer::dump() { 1416 #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" 1417 DEBUG_PRINT_STAT(NumConstantArgs); 1418 DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); 1419 DEBUG_PRINT_STAT(NumAllocaArgs); 1420 DEBUG_PRINT_STAT(NumConstantPtrCmps); 1421 DEBUG_PRINT_STAT(NumConstantPtrDiffs); 1422 DEBUG_PRINT_STAT(NumInstructionsSimplified); 1423 DEBUG_PRINT_STAT(NumInstructions); 1424 DEBUG_PRINT_STAT(SROACostSavings); 1425 DEBUG_PRINT_STAT(SROACostSavingsLost); 1426 DEBUG_PRINT_STAT(ContainsNoDuplicateCall); 1427 DEBUG_PRINT_STAT(Cost); 1428 DEBUG_PRINT_STAT(Threshold); 1429 #undef DEBUG_PRINT_STAT 1430 } 1431 #endif 1432 1433 /// \brief Test that two functions either have or have not the given attribute 1434 /// at the same time. 1435 template <typename AttrKind> 1436 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) { 1437 return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr); 1438 } 1439 1440 /// \brief Test that there are no attribute conflicts between Caller and Callee 1441 /// that prevent inlining. 
1442 static bool functionsHaveCompatibleAttributes(Function *Caller, 1443 Function *Callee, 1444 TargetTransformInfo &TTI) { 1445 return TTI.areInlineCompatible(Caller, Callee) && 1446 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 1447 } 1448 1449 InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold, 1450 TargetTransformInfo &CalleeTTI, 1451 AssumptionCacheTracker *ACT) { 1452 return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI, 1453 ACT); 1454 } 1455 1456 int llvm::computeThresholdFromOptLevels(unsigned OptLevel, 1457 unsigned SizeOptLevel) { 1458 if (OptLevel > 2) 1459 return OptAggressiveThreshold; 1460 if (SizeOptLevel == 1) // -Os 1461 return OptSizeThreshold; 1462 if (SizeOptLevel == 2) // -Oz 1463 return OptMinSizeThreshold; 1464 return DefaultInlineThreshold; 1465 } 1466 1467 int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; } 1468 1469 InlineCost llvm::getInlineCost(CallSite CS, Function *Callee, 1470 int DefaultThreshold, 1471 TargetTransformInfo &CalleeTTI, 1472 AssumptionCacheTracker *ACT) { 1473 1474 // Cannot inline indirect calls. 1475 if (!Callee) 1476 return llvm::InlineCost::getNever(); 1477 1478 // Calls to functions with always-inline attributes should be inlined 1479 // whenever possible. 1480 if (CS.hasFnAttr(Attribute::AlwaysInline)) { 1481 if (isInlineViable(*Callee)) 1482 return llvm::InlineCost::getAlways(); 1483 return llvm::InlineCost::getNever(); 1484 } 1485 1486 // Never inline functions with conflicting attributes (unless callee has 1487 // always-inline attribute). 1488 if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI)) 1489 return llvm::InlineCost::getNever(); 1490 1491 // Don't inline this call if the caller has the optnone attribute. 1492 if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone)) 1493 return llvm::InlineCost::getNever(); 1494 1495 // Don't inline functions which can be interposed at link-time. 
Don't inline 1496 // functions marked noinline or call sites marked noinline. 1497 // Note: inlining non-exact non-interposable fucntions is fine, since we know 1498 // we have *a* correct implementation of the source level function. 1499 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) || 1500 CS.isNoInline()) 1501 return llvm::InlineCost::getNever(); 1502 1503 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 1504 << "...\n"); 1505 1506 CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS); 1507 bool ShouldInline = CA.analyzeCall(CS); 1508 1509 DEBUG(CA.dump()); 1510 1511 // Check if there was a reason to force inlining or no inlining. 1512 if (!ShouldInline && CA.getCost() < CA.getThreshold()) 1513 return InlineCost::getNever(); 1514 if (ShouldInline && CA.getCost() >= CA.getThreshold()) 1515 return InlineCost::getAlways(); 1516 1517 return llvm::InlineCost::get(CA.getCost(), CA.getThreshold()); 1518 } 1519 1520 bool llvm::isInlineViable(Function &F) { 1521 bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice); 1522 for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { 1523 // Disallow inlining of functions which contain indirect branches or 1524 // blockaddresses. 1525 if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken()) 1526 return false; 1527 1528 for (auto &II : *BI) { 1529 CallSite CS(&II); 1530 if (!CS) 1531 continue; 1532 1533 // Disallow recursive calls. 1534 if (&F == CS.getCalledFunction()) 1535 return false; 1536 1537 // Disallow calls which expose returns-twice to a function not previously 1538 // attributed as such. 1539 if (!ReturnsTwice && CS.isCall() && 1540 cast<CallInst>(CS.getInstruction())->canReturnTwice()) 1541 return false; 1542 1543 // Disallow inlining functions that call @llvm.localescape. Doing this 1544 // correctly would require major changes to the inliner. 
1545 if (CS.getCalledFunction() && 1546 CS.getCalledFunction()->getIntrinsicID() == 1547 llvm::Intrinsic::localescape) 1548 return false; 1549 } 1550 } 1551 1552 return true; 1553 } 1554