//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  int Threshold;
  int Cost;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  /// Return true if size growth is allowed when inlining the callee at CS.
  bool allowSizeGrowth(CallSite CS);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *); void visit(Module &);
  void visit(Function *); void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);
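
  // Note on the visitor convention used below: each visit method returns true
  // when the instruction is effectively free at this call site (typically
  // because it folds away given the simplified arguments), and false when its
  // base cost should be charged against the inline budget. See analyzeBlock()
  // for how the result is consumed.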

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = DL.getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.
  if (I.isArrayAllocation()) {
    if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) {
      ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size);
      assert(AllocSize && "Allocation size not a constant int?");
      Type *Ty = I.getAllocatedType();
      AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue();
      return Base::visitAlloca(I);
    }
  }

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    Type *Ty = I.getAllocatedType();
    AllocatedSize += DL.getTypeAllocSize(Ty);
  }
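
  // Note that AllocatedSize is only consulted later, in analyzeBlock(), where
  // it is compared against InlineConstants::TotalAllocaSizeRecursiveCaller to
  // keep large static allocations out of recursive callers.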

  // We will happily inline static alloca instructions.
  if (I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and they disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
        = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
  return CandidateCS.paramHasAttr(ArgNo+1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

bool CallAnalyzer::allowSizeGrowth(CallSite CS) {
  // If the normal destination of the invoke or the parent block of the call
  // site is unreachable-terminated, there is little point in inlining this
  // unless there is literally zero cost.
  // FIXME: Note that it is possible that an unreachable-terminated block has a
  // hot entry. For example, in the scenario below, inlining hot_call_X() may
  // be beneficial:
  // main() {
  //   hot_call_1();
  //   ...
  //   hot_call_N()
  //   exit(0);
  // }
  // For now, we are not handling this corner case here as it is rare in real
  // code. In the future, we should elaborate this based on BPI and BFI in more
  // general threshold adjusting heuristics in updateThreshold().
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->getTerminator()))
      return false;
  } else if (isa<UnreachableInst>(Instr->getParent()->getTerminator()))
    return false;

  return true;
}
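
// As a concrete illustration of the interactions below, assuming the stock
// thresholds and no -inline-threshold override: a callee carrying the
// inlinehint attribute keeps the minsize threshold of 25 when its caller is
// compiled for minimum size (the hint is only honored when the caller is not
// minimizing size), whereas the same callee reached from an ordinary caller
// starting at the default threshold of 225 is raised to the 325 hint
// threshold, before the target-specific multiplier is applied.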
void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If no size growth is allowed for this inlining, set Threshold to 0.
  if (!allowSizeGrowth(CS)) {
    Threshold = 0;
    return;
  }

  // If -inline-threshold is not given, listen to the optsize and minsize
  // attributes when they would decrease the threshold.
  Function *Caller = CS.getCaller();

  if (!(DefaultInlineThreshold.getNumOccurrences() > 0)) {
    if (Caller->optForMinSize() && OptMinSizeThreshold < Threshold)
      Threshold = OptMinSizeThreshold;
    else if (Caller->optForSize() && OptSizeThreshold < Threshold)
      Threshold = OptSizeThreshold;
  }

  // If profile information is available, use that to adjust threshold of hot
  // and cold functions.
  // FIXME: The heuristics used below for determining hotness and coldness are
  // based on preliminary SPEC tuning and may not be optimal. Replace this with
  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
  bool HasPGOCounts = false;
  if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) {
    HasPGOCounts = true;
    FunctionCount = Callee.getEntryCount().getValue();
    MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
  }

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
  // minimize its size.
  bool InlineHint =
      Callee.hasFnAttribute(Attribute::InlineHint) ||
      (HasPGOCounts &&
       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
  if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    Threshold = HintThreshold;

  // Listen to the cold attribute or profile based coldness information
  // when it would decrease the threshold.
  bool ColdCallee =
      Callee.hasFnAttribute(Attribute::Cold) ||
      (HasPGOCounts &&
       FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));
  // Command line argument for DefaultInlineThreshold will override the default
  // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
  // do not use the default cold threshold even if it is smaller.
  if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
       ColdThreshold.getNumOccurrences() > 0) &&
      ColdCallee && ColdThreshold < Threshold)
    Threshold = ColdThreshold;

  // Finally, take the target-specific inlining threshold multiplier into
  // account.
  Threshold *= TTI.getInliningThresholdMultiplier();
}

bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
                                                        I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}
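
// Account for the cost of a call site. Direct calls are first constant folded
// via simplifyCallSite(); known intrinsics, recursion into the callee itself,
// and calls that will actually be lowered to real calls all receive special
// handling. Indirect calls whose target becomes known through the argument
// simplifications are speculatively analyzed with a nested CallAnalyzer so
// that likely devirtualization can earn a cost bonus.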
bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddresses (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}


/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function, regardless
    // of our ability to simplify them during inlining to constants or dead
    // code, is actually used by the vector bonus heuristic. As long as that's
    // true, we have to special case debug intrinsics here to prevent
    // differences in inlining due to debug symbols. Eventually, the number of
    // unsimplified instructions shouldn't factor into the cost computation,
    // but until then, hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->isInterposable())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold at any time, and cost cannot decrease, we can stop
  // processing the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);
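
  // As a worked example under the default configuration: with Threshold = 225
  // the body below is analyzed against 225 + 112 (single-BB bonus) + 337
  // (50%-vector bonus) = 674. Whatever portion of these bonuses the callee
  // does not actually earn is taken back before the final comparison, either
  // when a second live basic block is found or in the vector-density check at
  // the end of this function.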

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                    &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB, EphValues))
      return false;

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost < std::max(1, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
LLVM_DUMP_METHOD void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that two functions either both have or both lack the given
/// attribute.
template<typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
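
// A typical client is expected to drive the entry points below roughly as
// follows (an illustrative sketch only; inlineCallSite() is a hypothetical
// helper standing in for the caller's own inlining logic):
//
//   int Threshold = computeThresholdFromOptLevels(OptLevel, SizeOptLevel);
//   InlineCost IC = getInlineCost(CS, Threshold, CalleeTTI, &ACT);
//   if (IC)                  // never-inline was not hit and the cost is
//     inlineCallSite(CS);    // acceptable relative to the threshold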

InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT) {
  return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
                       ACT);
}

int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
                                        unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return OptMinSizeThreshold;
  return DefaultInlineThreshold;
}

int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }

InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
                               int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be interposed at link-time. Don't inline
  // functions marked noinline or call sites marked noinline.
  // Note: inlining non-exact non-interposable functions is fine, since we know
  // we have *a* correct implementation of the source level function.
  if (Callee->isInterposable() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
                     << "...\n");

  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}
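
// Reject constructs the inliner cannot handle at all, independent of cost:
// indirect branches and blocks with their address taken, direct recursion,
// calls that would newly expose returns-twice behavior, and calls to
// @llvm.localescape.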
bool llvm::isInlineViable(Function &F) {
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}