//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/InlineCost.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "inline-cost"

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

// Default thresholds (in abstract cost units) selected by the optimization
// level when no explicit -inline-threshold is given on the command line.

// Threshold to use when optsize is specified (and there is no
// -inline-threshold).
const int OptSizeThreshold = 75;

// Threshold to use when -Oz is specified (and there is no -inline-threshold).
const int OptMinSizeThreshold = 25;

// Threshold to use when -O[34] is specified (and there is no
// -inline-threshold).
const int OptAggressiveThreshold = 275;

static cl::opt<int> DefaultInlineThreshold(
    "inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
    cl::desc("Control the amount of inlining to perform (default = 225)"));

static cl::opt<int> HintThreshold(
    "inlinehint-threshold", cl::Hidden, cl::init(325),
    cl::desc("Threshold for inlining functions with inline hint"));

// We introduce this threshold to help performance of instrumentation based
// PGO before we actually hook up inliner with analysis passes such as BPI and
// BFI.
static cl::opt<int> ColdThreshold(
    "inlinecold-threshold", cl::Hidden, cl::init(225),
    cl::desc("Threshold for inlining functions with cold attribute"));

namespace {

/// \brief Analyzer for estimating the cost of inlining one specific call
/// site. Walks the callee's instructions via InstVisitor, accumulating Cost
/// and comparing it against Threshold while simulating the simplifications
/// (constant propagation, dead CFG edges, SROA) that inlining would enable.
/// Each visit method returns true when the visited instruction is expected
/// to be free after inlining.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  /// The TargetTransformInfo available for this compilation.
  const TargetTransformInfo &TTI;

  /// The cache of @llvm.assume intrinsics.
  AssumptionCacheTracker *ACT;

  // The called function.
  Function &F;

  // The candidate callsite being analyzed. Please do not use this to do
  // analysis in the caller function; we want the inline cost query to be
  // easily cacheable. Instead, use the cover function paramHasAttr.
  CallSite CandidateCS;

  // Budget for this call site and the running cost accumulated so far.
  int Threshold;
  int Cost;

  // Flags describing properties discovered while walking the callee; several
  // of these (recursion, returns_twice, indirectbr, localescape) abort the
  // analysis outright.
  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  bool ContainsNoDuplicateCall;
  bool HasReturn;
  bool HasIndirectBr;
  bool HasFrameEscape;

  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;

  // Custom simplification helper routines.
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  bool simplifyCallSite(Function *F, CallSite CS);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  /// Return true if the given argument to the function being considered for
  /// inlining has the given attribute set either at the call site or the
  /// function declaration. Primarily used to inspect call site specific
  /// attributes since these can be more precise than the ones on the callee
  /// itself.
  bool paramHasAttr(Argument *A, Attribute::AttrKind Attr);

  /// Return true if the given value is known non null within the callee if
  /// inlined through this particular callsite.
  bool isKnownNonNullInCallee(Value *V);

  /// Update Threshold based on callsite properties such as callee
  /// attributes and callee hotness for PGO builds. The Callee is explicitly
  /// passed to support analyzing indirect calls whose target is inferred by
  /// analysis.
  void updateThreshold(CallSite CS, Function &Callee);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *); void visit(Module &);
  void visit(Function *); void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitCmpInst(CmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitExtractValue(ExtractValueInst &I);
  bool visitInsertValue(InsertValueInst &I);
  bool visitCallSite(CallSite CS);
  bool visitReturnInst(ReturnInst &RI);
  bool visitBranchInst(BranchInst &BI);
  bool visitSwitchInst(SwitchInst &SI);
  bool visitIndirectBrInst(IndirectBrInst &IBI);
  bool visitResumeInst(ResumeInst &RI);
  bool visitCleanupReturnInst(CleanupReturnInst &RI);
  bool visitCatchReturnInst(CatchReturnInst &RI);
  bool visitUnreachableInst(UnreachableInst &I);

public:
  CallAnalyzer(const TargetTransformInfo &TTI, AssumptionCacheTracker *ACT,
               Function &Callee, int Threshold, CallSite CSArg)
      : TTI(TTI), ACT(ACT), F(Callee), CandidateCS(CSArg), Threshold(Threshold),
        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
        ExposesReturnsTwice(false), HasDynamicAlloca(false),
        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
        HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
        SROACostSavings(0), SROACostSavingsLost(0) {}

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Lookup the SROA-candidate argument and cost iterator which V maps to.
/// Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
265 void CallAnalyzer::disableSROA(Value *V) { 266 Value *SROAArg; 267 DenseMap<Value *, int>::iterator CostIt; 268 if (lookupSROAArgAndCost(V, SROAArg, CostIt)) 269 disableSROA(CostIt); 270 } 271 272 /// \brief Accumulate the given cost for a particular SROA candidate. 273 void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt, 274 int InstructionCost) { 275 CostIt->second += InstructionCost; 276 SROACostSavings += InstructionCost; 277 } 278 279 /// \brief Check whether a GEP's indices are all constant. 280 /// 281 /// Respects any simplified values known during the analysis of this callsite. 282 bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) { 283 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) 284 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I)) 285 return false; 286 287 return true; 288 } 289 290 /// \brief Accumulate a constant GEP offset into an APInt if possible. 291 /// 292 /// Returns false if unable to compute the offset for any reason. Respects any 293 /// simplified values known during the analysis of this callsite. 294 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { 295 const DataLayout &DL = F.getParent()->getDataLayout(); 296 unsigned IntPtrWidth = DL.getPointerSizeInBits(); 297 assert(IntPtrWidth == Offset.getBitWidth()); 298 299 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); 300 GTI != GTE; ++GTI) { 301 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 302 if (!OpC) 303 if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand())) 304 OpC = dyn_cast<ConstantInt>(SimpleOp); 305 if (!OpC) 306 return false; 307 if (OpC->isZero()) continue; 308 309 // Handle a struct index, which adds its field offset to the pointer. 
310 if (StructType *STy = dyn_cast<StructType>(*GTI)) { 311 unsigned ElementIdx = OpC->getZExtValue(); 312 const StructLayout *SL = DL.getStructLayout(STy); 313 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); 314 continue; 315 } 316 317 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); 318 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; 319 } 320 return true; 321 } 322 323 bool CallAnalyzer::visitAlloca(AllocaInst &I) { 324 // Check whether inlining will turn a dynamic alloca into a static 325 // alloca, and handle that case. 326 if (I.isArrayAllocation()) { 327 if (Constant *Size = SimplifiedValues.lookup(I.getArraySize())) { 328 ConstantInt *AllocSize = dyn_cast<ConstantInt>(Size); 329 assert(AllocSize && "Allocation size not a constant int?"); 330 Type *Ty = I.getAllocatedType(); 331 AllocatedSize += Ty->getPrimitiveSizeInBits() * AllocSize->getZExtValue(); 332 return Base::visitAlloca(I); 333 } 334 } 335 336 // Accumulate the allocated size. 337 if (I.isStaticAlloca()) { 338 const DataLayout &DL = F.getParent()->getDataLayout(); 339 Type *Ty = I.getAllocatedType(); 340 AllocatedSize += DL.getTypeAllocSize(Ty); 341 } 342 343 // We will happily inline static alloca instructions. 344 if (I.isStaticAlloca()) 345 return Base::visitAlloca(I); 346 347 // FIXME: This is overly conservative. Dynamic allocas are inefficient for 348 // a variety of reasons, and so we would like to not inline them into 349 // functions which don't currently have a dynamic alloca. This simply 350 // disables inlining altogether in the presence of a dynamic alloca. 351 HasDynamicAlloca = true; 352 return false; 353 } 354 355 bool CallAnalyzer::visitPHI(PHINode &I) { 356 // FIXME: We should potentially be tracking values through phi nodes, 357 // especially when they collapse to a single value due to deleted CFG edges 358 // during inlining. 

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

/// \brief Model a GEP: constant-offset and all-constant-index GEPs are free
/// and keep SROA viable; variable GEPs cost math and disable SROA.
bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

/// \brief Bitcasts are modeled as free; constants, base+offset pairs and SROA
/// candidacy all propagate straight through them.
bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts
  std::pair<Value *, APInt> BaseAndOffset
    = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize >= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (IntegerSize <= DL.getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

/// \brief Fallback handler for any cast not special-cased above.
bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
  if (!COp)
    COp = SimplifiedValues.lookup(I.getOperand(0));
  if (COp)
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}

/// \brief Fallback for unary instructions: constant-fold if the operand is
/// (or simplifies to) a constant, otherwise disable SROA on the operand.
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *COp = dyn_cast<Constant>(Operand);
  if (!COp)
    COp = SimplifiedValues.lookup(Operand);
  if (COp) {
    const DataLayout &DL = F.getParent()->getDataLayout();
    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
      SimplifiedValues[&I] = C;
      return true;
    }
  }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
  unsigned ArgNo = A->getArgNo();
  // CallSite attribute indices are 1-based (0 is the return value).
  return CandidateCS.paramHasAttr(ArgNo+1, Attr);
}

bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
  // Does the *call site* have the NonNull attribute set on an argument? We
  // use the attribute on the call site to memoize any analysis done in the
  // caller. This will also trip if the callee function has a non-null
  // parameter attribute, but that's a less interesting case because hopefully
  // the callee would already have been simplified based on that.
  if (Argument *A = dyn_cast<Argument>(V))
    if (paramHasAttr(A, Attribute::NonNull))
      return true;

  // Is this an alloca in the caller? This is distinct from the attribute case
  // above because attributes aren't updated within the inliner itself and we
  // always want to catch the alloca derived case.
  if (isAllocaDerivedArg(V))
    // We can actually predict the result of comparisons between an
    // alloca-derived value and null. Note that this fires regardless of
    // SROA firing.
    return true;

  return false;
}

void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
  // If -inline-threshold is not given, listen to the optsize attribute when it
  // would decrease the threshold.
  Function *Caller = CS.getCaller();

  // FIXME: Use Function::optForSize()
  bool OptSize = Caller->hasFnAttribute(Attribute::OptimizeForSize);

  if (!(DefaultInlineThreshold.getNumOccurrences() > 0) && OptSize &&
      OptSizeThreshold < Threshold)
    Threshold = OptSizeThreshold;

  // If profile information is available, use that to adjust threshold of hot
  // and cold functions.
  // FIXME: The heuristic used below for determining hotness and coldness are
  // based on preliminary SPEC tuning and may not be optimal. Replace this with
  // a well-tuned heuristic based on *callsite* hotness and not callee hotness.
  uint64_t FunctionCount = 0, MaxFunctionCount = 0;
  bool HasPGOCounts = false;
  if (Callee.getEntryCount() && Callee.getParent()->getMaximumFunctionCount()) {
    HasPGOCounts = true;
    FunctionCount = Callee.getEntryCount().getValue();
    MaxFunctionCount = Callee.getParent()->getMaximumFunctionCount().getValue();
  }

  // Listen to the inlinehint attribute or profile based hotness information
  // when it would increase the threshold and the caller does not need to
  // minimize its size.
  bool InlineHint =
      Callee.hasFnAttribute(Attribute::InlineHint) ||
      (HasPGOCounts &&
       FunctionCount >= (uint64_t)(0.3 * (double)MaxFunctionCount));
  if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
    Threshold = HintThreshold;

  // Listen to the cold attribute or profile based coldness information
  // when it would decrease the threshold.
  bool ColdCallee =
      Callee.hasFnAttribute(Attribute::Cold) ||
      (HasPGOCounts &&
       FunctionCount <= (uint64_t)(0.01 * (double)MaxFunctionCount));
  // Command line argument for DefaultInlineThreshold will override the default
  // ColdThreshold. If we have -inline-threshold but no -inlinecold-threshold,
  // do not use the default cold threshold even if it is smaller.
  if ((DefaultInlineThreshold.getNumOccurrences() == 0 ||
       ColdThreshold.getNumOccurrences() > 0) &&
      ColdCallee && ColdThreshold < Threshold)
    Threshold = ColdThreshold;
}

/// \brief Model comparisons, folding those that become constant after
/// argument substitution (including pointer comparisons with a common base
/// and null-checks of known-non-null values).
bool CallAnalyzer::visitCmpInst(CmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }
  }

  if (I.getOpcode() == Instruction::FCmp)
    return false;

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // if we know the value (argument) can't be null
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)) &&
      isKnownNonNullInCallee(I.getOperand(0))) {
    bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
    SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                      : ConstantInt::getFalse(I.getType());
    return true;
  }
  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      // Null-checks of a SROA candidate fold away once SROA fires; count the
      // saving rather than disabling the candidate.
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  std::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    std::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

/// \brief Generic binary operator handler: free if instsimplify can fold it
/// to a constant given the values known at this call site.
bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  const DataLayout &DL = F.getParent()->getDataLayout();
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = nullptr;
  if (auto FI = dyn_cast<FPMathOperator>(&I))
    SimpleV =
        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
  else
    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);

  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

/// \brief Simple loads of a SROA candidate become free (register reads) once
/// SROA fires; volatile/atomic loads disable the candidate instead.
bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

/// \brief Stores are handled symmetrically to loads; see visitLoad.
bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
  // Constant folding for extract value is trivial.
  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
  if (!C)
    C = SimplifiedValues.lookup(I.getAggregateOperand());
  if (C) {
    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
  // Constant folding for insert value is trivial.
  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
  if (!AggC)
    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
  if (!InsertedC)
    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
  if (AggC && InsertedC) {
    SimplifiedValues[&I] = ConstantExpr::getInsertValue(AggC, InsertedC,
                                                        I.getIndices());
    return true;
  }

  // SROA can look through these but give them a cost.
  return false;
}

/// \brief Try to simplify a call site.
///
/// Takes a concrete function and callsite and tries to actually simplify it by
/// analyzing the arguments and call itself with instsimplify. Returns true if
/// it has simplified the callsite to some other entity (a constant), making it
/// free.
bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
  // FIXME: Using the instsimplify logic directly for this is inefficient
  // because we have to continually rebuild the argument list even when no
  // simplifications can be performed. Until that is fixed with remapping
  // inside of instsimplify, directly constant fold calls here.
  if (!canConstantFoldCallTo(F))
    return false;

  // Try to re-map the arguments to constants.
  SmallVector<Constant *, 4> ConstantArgs;
  ConstantArgs.reserve(CS.arg_size());
  for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
       I != E; ++I) {
    Constant *C = dyn_cast<Constant>(*I);
    if (!C)
      C = dyn_cast_or_null<Constant>(SimplifiedValues.lookup(*I));
    if (!C)
      return false; // This argument doesn't map to a constant.

    ConstantArgs.push_back(C);
  }
  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
    // The entire call folds to a constant; record it and report success.
    SimplifiedValues[CS.getInstruction()] = C;
    return true;
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  // A returns_twice callee (e.g. setjmp-like) cannot be exposed to a caller
  // that is not itself attributed returns_twice.
  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
      !F.hasFnAttribute(Attribute::ReturnsTwice)) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }
  if (CS.isCall() &&
      cast<CallInst>(CS.getInstruction())->cannotDuplicate())
    ContainsNoDuplicateCall = true;

  if (Function *F = CS.getCalledFunction()) {
    // When we have a concrete function, first try to simplify it directly.
    if (simplifyCallSite(F, CS))
      return true;

    // Next check if it is an intrinsic we know about.
    // FIXME: Lift this into part of the InstVisitor.
    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
      switch (II->getIntrinsicID()) {
      default:
        return Base::visitCallSite(CS);

      case Intrinsic::memset:
      case Intrinsic::memcpy:
      case Intrinsic::memmove:
        // SROA can usually chew through these intrinsics, but they aren't free.
        return false;
      case Intrinsic::localescape:
        HasFrameEscape = true;
        return false;
      }
    }

    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (TTI.isLoweredToCall(F)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target.
  // This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TTI, ACT, *F, InlineConstants::IndirectCallThreshold, CS);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // threshold to get the bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, CA.getThreshold() - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
  // At least one return instruction will be free after inlining.
  bool Free = !HasReturn;
  HasReturn = true;
  return Free;
}

bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
  // We model unconditional branches as essentially free -- they really
  // shouldn't exist at all, but handling them makes the behavior of the
  // inliner more regular and predictable. Interestingly, conditional branches
  // which will fold away are also free.
  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
         dyn_cast_or_null<ConstantInt>(
             SimplifiedValues.lookup(BI.getCondition()));
}

bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
  // We model unconditional switches as free, see the comments on handling
  // branches.
  if (isa<ConstantInt>(SI.getCondition()))
    return true;
  if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
    if (isa<ConstantInt>(V))
      return true;

  // Otherwise, we need to accumulate a cost proportional to the number of
  // distinct successor blocks. This fan-out in the CFG cannot be represented
  // for free even if we can represent the core switch as a jumptable that
  // takes a single instruction.
  //
  // NB: We convert large switches which are just used to initialize large phi
  // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
  // inlining those. It will prevent inlining in cases where the optimization
  // does not (yet) fire.
  SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
  SuccessorBlocks.insert(SI.getDefaultDest());
  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
    SuccessorBlocks.insert(I.getCaseSuccessor());
  // Add cost corresponding to the number of distinct destinations. The first
  // we model as free because of fallthrough.
  Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
  return false;
}

bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
  // We never want to inline functions that contain an indirectbr. This is
  // incorrect because all the blockaddress's (in static global initializers
  // for example) would be referring to the original function, and this
  // indirect jump would jump from the inlined copy of the function into the
  // original function which is extremely undefined behavior.
  // FIXME: This logic isn't really right; we can safely inline functions with
  // indirectbr's as long as no other function or global references the
  // blockaddress of a block within the current function.
  HasIndirectBr = true;
  return false;
}

bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a resume instruction.
  return false;
}

bool CallAnalyzer::visitCleanupReturnInst(CleanupReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a cleanupret instruction.
  return false;
}

bool CallAnalyzer::visitCatchReturnInst(CatchReturnInst &CRI) {
  // FIXME: It's not clear that a single instruction is an accurate model for
  // the inline cost of a catchret instruction.
  return false;
}

bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
  // FIXME: It might be reasonable to discount the cost of instructions leading
  // to unreachable as they have the lowest possible impact on both runtime and
  // code size.
  return true; // No actual code is needed for unreachable.
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Catch-all visitor for anything without a dedicated handler above.
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}


/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible to inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB,
                                SmallPtrSetImpl<const Value *> &EphValues) {
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // FIXME: Currently, the number of instructions in a function regardless of
    // our ability to simplify them during inline to constants or dead code,
    // are actually used by the vector bonus heuristic. As long as that's true,
    // we have to special case debug intrinsics here to prevent differences in
    // inlining due to debug symbols. Eventually, the number of unsimplified
    // instructions shouldn't factor into the cost computation, but until then,
    // hack around it here.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    // Skip ephemeral values.
    if (EphValues.count(&*I))
      continue;

    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction is floating point, and the target says this operation
    // is expensive or the function has the "use-soft-float" attribute, this may
    // eventually become a library call. Treat the cost as such.
    if (I->getType()->isFloatingPointTy()) {
      bool hasSoftFloatAttr = false;

      // If the function has the "use-soft-float" attribute, mark it as
      // expensive.
      if (F.hasFnAttribute("use-soft-float")) {
        Attribute Attr = F.getFnAttribute("use-soft-float");
        StringRef Val = Attr.getValueAsString();
        if (Val == "true")
          hasSoftFloatAttr = true;
      }

      if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
          hasSoftFloatAttr)
        Cost += InlineConstants::CallPenalty;
    }

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(&*I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
        HasIndirectBr || HasFrameEscape)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    // Check if we've passed the maximum possible threshold so we don't spin in
    // huge basic blocks that will never inline.
    if (Cost > Threshold)
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!V->getType()->isPointerTy())
    return nullptr;

  const DataLayout &DL = F.getParent()->getDataLayout();
  unsigned IntPtrWidth = DL.getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  // Track visited values so a cycle (possible in unreachable code) terminates
  // the walk instead of looping forever.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      // Only inbounds GEPs with fully-constant indices can be accumulated.
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return nullptr;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      // Aliases that may be overridden at link time cannot be looked through.
      if (GA->mayBeOverridden())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V).second);

  Type *IntPtrTy = DL.getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Perform some tweaks to the cost and threshold based on the direct
  // callsite information.

  // We want to more aggressively inline vector-dense kernels, so up the
  // threshold, and we'll lower it if the % of vector instructions gets too
  // low. Note that these bonuses are somewhat arbitrary and evolved over time
  // by accident as much as because they are principled bonuses.
  //
  // FIXME: It would be nice to remove all such bonuses. At least it would be
  // nice to base the bonus values on something more scientific.
  assert(NumInstructions == 0);
  assert(NumVectorInstructions == 0);

  // Update the threshold based on callsite properties
  updateThreshold(CS, F);

  FiftyPercentVectorBonus = 3 * Threshold / 2;
  TenPercentVectorBonus = 3 * Threshold / 4;
  const DataLayout &DL = F.getParent()->getDataLayout();

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;

  // Speculatively apply all possible bonuses to Threshold. If cost exceeds
  // this Threshold any time, and cost cannot decrease, we can stop processing
  // the rest of the function body.
  Threshold += (SingleBBBonus + FiftyPercentVectorBonus);

  // Give out bonuses per argument, as the instructions setting them up will
  // be gone after inlining.
  for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
    if (CS.isByValArgument(I)) {
      // We approximate the number of loads and stores needed by dividing the
      // size of the byval type by the target's pointer size.
      PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
      unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
      unsigned PointerSize = DL.getPointerSizeInBits();
      // Ceiling division.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

      // If it generates more than 8 stores it is likely to be expanded as an
      // inline memcpy so we take that as an upper bound. Otherwise we assume
      // one load and one store per word copied.
      // FIXME: The maxStoresPerMemcpy setting from the target should be used
      // here instead of a magic number of 8, but it's not available via
      // DataLayout.
      NumStores = std::min(NumStores, 8U);

      Cost -= 2 * NumStores * InlineConstants::InstrCost;
    } else {
      // For non-byval arguments subtract off one instruction per call
      // argument.
      Cost -= InlineConstants::InstrCost;
    }
  }

  // If there is only one call of the function, and it has internal linkage,
  // the cost of inlining it drops dramatically.
  bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() &&
                                    &F == CS.getCalledFunction();
  if (OnlyOneCallAndLocalLinkage)
    Cost += InlineConstants::LastCallToStaticBonus;

  // If the instruction after the call, or if the normal destination of the
  // invoke is an unreachable instruction, the function is noreturn. As such,
  // there is little point in inlining this unless there is literally zero
  // cost.
  Instruction *Instr = CS.getInstruction();
  if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
    if (isa<UnreachableInst>(II->getNormalDest()->begin()))
      Threshold = 0;
  } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
    Threshold = 0;

  // If this function uses the coldcc calling convention, prefer not to inline
  // it.
  if (F.getCallingConv() == CallingConv::Cold)
    Cost += InlineConstants::ColdccPenalty;

  // Check if we're done. This can happen due to bonuses and penalties.
  if (Cost > Threshold)
    return false;

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (User *U : Caller->users()) {
    CallSite Site(U);
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[&*FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[&*FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // FIXME: If a caller has multiple calls to a callee, we end up recomputing
  // the ephemeral values multiple times (and they're completely determined by
  // the callee, so this is purely duplicate work).
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(&F, &ACT->getAssumptionCache(F), EphValues);

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors.
  // To
  // accomplish this, prioritizing for small iterations because we exit after
  // crossing our threshold, we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (Cost > Threshold)
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Disallow inlining a blockaddress. A blockaddress only has defined
    // behavior for an indirect branch in the same function, and we do not
    // currently support inlining indirect branches. But, the inliner may not
    // see an indirect branch that ends up being dead code at a particular call
    // site. If the blockaddress escapes the function, e.g., via a global
    // variable, inlining may lead to an invalid cross-function reference.
    if (BB->hasAddressTaken())
      return false;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail on out.
    if (!analyzeBlock(BB, EphValues)) {
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
          HasIndirectBr || HasFrameEscape)
        return false;

      // If the caller is a recursive function then we don't want to inline
      // functions which allocate a lot of stack space because it would increase
      // the caller stack usage dramatically.
      if (IsCallerRecursive &&
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;

      break;
    }

    TerminatorInst *TI = BB->getTerminator();

    // Add in the live successors by first checking whether we have terminator
    // that may be simplified based on the values simplified by this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        // A condition folded to a constant selects exactly one live successor.
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  // If this is a noduplicate call, we can still inline as long as
  // inlining this would cause the removal of the caller (so the instruction
  // is not actually duplicated, just moved).
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return false;

  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
  if (NumVectorInstructions <= NumInstructions / 10)
    Threshold -= FiftyPercentVectorBonus;
  else if (NumVectorInstructions <= NumInstructions / 2)
    Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus);

  return Cost <= std::max(0, Threshold);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(NumInstructions);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
  DEBUG_PRINT_STAT(ContainsNoDuplicateCall);
  DEBUG_PRINT_STAT(Cost);
  DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
#endif

/// \brief Test that two functions either have or have not the given attribute
/// at the same time.
template<typename AttrKind>
static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
}

/// \brief Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
static bool functionsHaveCompatibleAttributes(Function *Caller,
                                              Function *Callee,
                                              TargetTransformInfo &TTI) {
  // Both the target-specific notion of inline compatibility and the
  // IR-attribute-level one must agree.
  return TTI.areInlineCompatible(Caller, Callee) &&
         AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}

/// Convenience overload: analyze the function directly called by CS.
InlineCost llvm::getInlineCost(CallSite CS, int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT) {
  return getInlineCost(CS, CS.getCalledFunction(), DefaultThreshold, CalleeTTI,
                       ACT);
}

/// Map the -O optimization level and -Os/-Oz size level to the corresponding
/// default inline threshold constant.
int llvm::computeThresholdFromOptLevels(unsigned OptLevel,
                                        unsigned SizeOptLevel) {
  if (OptLevel > 2)
    return OptAggressiveThreshold;
  if (SizeOptLevel == 1) // -Os
    return OptSizeThreshold;
  if (SizeOptLevel == 2) // -Oz
    return OptMinSizeThreshold;
  return DefaultInlineThreshold;
}

int llvm::getDefaultInlineThreshold() { return DefaultInlineThreshold; }

InlineCost llvm::getInlineCost(CallSite CS, Function *Callee,
                               int DefaultThreshold,
                               TargetTransformInfo &CalleeTTI,
                               AssumptionCacheTracker *ACT) {

  // Cannot inline indirect calls.
  if (!Callee)
    return llvm::InlineCost::getNever();

  // Calls to functions with always-inline attributes should be inlined
  // whenever possible.
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways();
    return llvm::InlineCost::getNever();
  }

  // Never inline functions with conflicting attributes (unless callee has
  // always-inline attribute).
  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI))
    return llvm::InlineCost::getNever();

  // Don't inline this call if the caller has the optnone attribute.
  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
    return llvm::InlineCost::getNever();

  // Don't inline functions which can be redefined at link-time to mean
  // something else. Don't inline functions marked noinline or call sites
  // marked noinline.
  if (Callee->mayBeOverridden() ||
      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
        << "...\n");

  CallAnalyzer CA(CalleeTTI, ACT, *Callee, DefaultThreshold, CS);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}

bool llvm::isInlineViable(Function &F) {
  // Quick structural screen used for the always-inline path: reject
  // constructs the inliner cannot handle at all.
  bool ReturnsTwice = F.hasFnAttribute(Attribute::ReturnsTwice);
  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    // Disallow inlining of functions which contain indirect branches or
    // blockaddresses.
    if (isa<IndirectBrInst>(BI->getTerminator()) || BI->hasAddressTaken())
      return false;

    for (auto &II : *BI) {
      CallSite CS(&II);
      if (!CS)
        continue;

      // Disallow recursive calls.
      if (&F == CS.getCalledFunction())
        return false;

      // Disallow calls which expose returns-twice to a function not previously
      // attributed as such.
      if (!ReturnsTwice && CS.isCall() &&
          cast<CallInst>(CS.getInstruction())->canReturnTwice())
        return false;

      // Disallow inlining functions that call @llvm.localescape. Doing this
      // correctly would require major changes to the inliner.
      if (CS.getCalledFunction() &&
          CS.getCalledFunction()->getIntrinsicID() ==
              llvm::Intrinsic::localescape)
        return false;
    }
  }

  return true;
}