//===- InlineCost.cpp - Cost analysis for inliner -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements inline cost analysis.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "inline-cost"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/CallingConv.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Operator.h"
#include "llvm/GlobalAlias.h"
#include "llvm/Target/TargetData.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"

using namespace llvm;

STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");

namespace {

class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
  typedef InstVisitor<CallAnalyzer, bool> Base;
  friend class InstVisitor<CallAnalyzer, bool>;

  // TargetData if available, or null.
  const TargetData *const TD;

  // The called function.
  Function &F;

  int Threshold;
  int Cost;
  const bool AlwaysInline;

  bool IsCallerRecursive;
  bool IsRecursiveCall;
  bool ExposesReturnsTwice;
  bool HasDynamicAlloca;
  /// Number of bytes allocated statically by the callee.
  uint64_t AllocatedSize;
  unsigned NumInstructions, NumVectorInstructions;
  int FiftyPercentVectorBonus, TenPercentVectorBonus;
  int VectorBonus;

  // While we walk the potentially-inlined instructions, we build up and
  // maintain a mapping of simplified values specific to this callsite. The
  // idea is to propagate any special information we have about arguments to
  // this call through the inlinable section of the function, and account for
  // likely simplifications post-inlining. The most important aspect we track
  // is CFG altering simplifications -- when we prove a basic block dead, that
  // can cause dramatic shifts in the cost of inlining a function.
  DenseMap<Value *, Constant *> SimplifiedValues;

  // Keep track of the values which map back (through function arguments) to
  // allocas on the caller stack which could be simplified through SROA.
  DenseMap<Value *, Value *> SROAArgValues;

  // The mapping of caller Alloca values to their accumulated cost savings. If
  // we have to disable SROA for one of the allocas, this tells us how much
  // cost must be added.
  DenseMap<Value *, int> SROAArgCosts;

  // Keep track of values which map to a pointer base and constant offset.
  DenseMap<Value *, std::pair<Value *, APInt> > ConstantOffsetPtrs;

  // Custom simplification helper routines.
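  // (A sketch of the bookkeeping behind these helpers: each load or store
  // through a pointer that maps back to a caller alloca is credited to
  // SROAArgCosts as a likely post-inlining saving; the first use we cannot
  // prove SROA-safe disables the candidate and charges the accumulated
  // credit back onto Cost.)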
  bool isAllocaDerivedArg(Value *V);
  bool lookupSROAArgAndCost(Value *V, Value *&Arg,
                            DenseMap<Value *, int>::iterator &CostIt);
  void disableSROA(DenseMap<Value *, int>::iterator CostIt);
  void disableSROA(Value *V);
  void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                          int InstructionCost);
  bool handleSROACandidate(bool IsSROAValid,
                           DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
  bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
  ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);

  // Custom analysis routines.
  bool analyzeBlock(BasicBlock *BB);

  // Disable several entry points to the visitor so we don't accidentally use
  // them by declaring but not defining them here.
  void visit(Module *); void visit(Module &);
  void visit(Function *); void visit(Function &);
  void visit(BasicBlock *); void visit(BasicBlock &);

  // Provide base case for our instruction visit.
  bool visitInstruction(Instruction &I);

  // Our visit overrides.
  bool visitAlloca(AllocaInst &I);
  bool visitPHI(PHINode &I);
  bool visitGetElementPtr(GetElementPtrInst &I);
  bool visitBitCast(BitCastInst &I);
  bool visitPtrToInt(PtrToIntInst &I);
  bool visitIntToPtr(IntToPtrInst &I);
  bool visitCastInst(CastInst &I);
  bool visitUnaryInstruction(UnaryInstruction &I);
  bool visitICmp(ICmpInst &I);
  bool visitSub(BinaryOperator &I);
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoad(LoadInst &I);
  bool visitStore(StoreInst &I);
  bool visitCallSite(CallSite CS);

public:
  CallAnalyzer(const TargetData *TD, Function &Callee, int Threshold)
    : TD(TD), F(Callee), Threshold(Threshold), Cost(0),
      AlwaysInline(F.getFnAttributes().hasAlwaysInlineAttr()),
      IsCallerRecursive(false), IsRecursiveCall(false),
      ExposesReturnsTwice(false), HasDynamicAlloca(false), AllocatedSize(0),
      NumInstructions(0), NumVectorInstructions(0),
      FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
      NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
      NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
      NumInstructionsSimplified(0), SROACostSavings(0),
      SROACostSavingsLost(0) {
  }

  bool analyzeCall(CallSite CS);

  int getThreshold() { return Threshold; }
  int getCost() { return Cost; }

  // Keep a bunch of stats about the cost savings found so we can print them
  // out when debugging.
  unsigned NumConstantArgs;
  unsigned NumConstantOffsetPtrArgs;
  unsigned NumAllocaArgs;
  unsigned NumConstantPtrCmps;
  unsigned NumConstantPtrDiffs;
  unsigned NumInstructionsSimplified;
  unsigned SROACostSavings;
  unsigned SROACostSavingsLost;

  void dump();
};

} // namespace

/// \brief Test whether the given value is an Alloca-derived function argument.
bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
  return SROAArgValues.count(V);
}

/// \brief Look up the SROA-candidate argument and cost iterator which V maps
/// to. Returns false if V does not map to a SROA-candidate.
bool CallAnalyzer::lookupSROAArgAndCost(
    Value *V, Value *&Arg, DenseMap<Value *, int>::iterator &CostIt) {
  if (SROAArgValues.empty() || SROAArgCosts.empty())
    return false;

  DenseMap<Value *, Value *>::iterator ArgIt = SROAArgValues.find(V);
  if (ArgIt == SROAArgValues.end())
    return false;

  Arg = ArgIt->second;
  CostIt = SROAArgCosts.find(Arg);
  return CostIt != SROAArgCosts.end();
}

/// \brief Disable SROA for the candidate marked by this cost iterator.
///
/// This marks the candidate as no longer viable for SROA, and adds the cost
/// savings associated with it back into the inline cost measurement.
void CallAnalyzer::disableSROA(DenseMap<Value *, int>::iterator CostIt) {
  // If we're no longer able to perform SROA we need to undo its cost savings
  // and prevent subsequent analysis.
  Cost += CostIt->second;
  SROACostSavings -= CostIt->second;
  SROACostSavingsLost += CostIt->second;
  SROAArgCosts.erase(CostIt);
}

/// \brief If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(V, SROAArg, CostIt))
    disableSROA(CostIt);
}

/// \brief Accumulate the given cost for a particular SROA candidate.
void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                                      int InstructionCost) {
  CostIt->second += InstructionCost;
  SROACostSavings += InstructionCost;
}

/// \brief Helper for the common pattern of handling a SROA candidate.
/// Either accumulates the cost savings if the SROA remains valid, or disables
/// SROA for the candidate.
bool CallAnalyzer::handleSROACandidate(bool IsSROAValid,
                                       DenseMap<Value *, int>::iterator CostIt,
                                       int InstructionCost) {
  if (IsSROAValid) {
    accumulateSROACost(CostIt, InstructionCost);
    return true;
  }

  disableSROA(CostIt);
  return false;
}

/// \brief Check whether a GEP's indices are all constant.
///
/// Respects any simplified values known during the analysis of this callsite.
bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
      return false;

  return true;
}

/// \brief Accumulate a constant GEP offset into an APInt if possible.
///
/// Returns false if unable to compute the offset for any reason. Respects any
/// simplified values known during the analysis of this callsite.
bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
  if (!TD)
    return false;

  unsigned IntPtrWidth = TD->getPointerSizeInBits();
  assert(IntPtrWidth == Offset.getBitWidth());

  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
       GTI != GTE; ++GTI) {
    ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
    if (!OpC)
      if (Constant *SimpleOp = SimplifiedValues.lookup(GTI.getOperand()))
        OpC = dyn_cast<ConstantInt>(SimpleOp);
    if (!OpC)
      return false;
    if (OpC->isZero()) continue;

    // Handle a struct index, which adds its field offset to the pointer.
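    // E.g. given
    //   %f = getelementptr inbounds {i32, i64}* %s, i32 0, i32 1
    // the struct index 1 contributes the StructLayout offset of field 1 (8
    // bytes on a typical target where i64 is 8-byte aligned), rather than an
    // index * element-size product.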
    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
      unsigned ElementIdx = OpC->getZExtValue();
      const StructLayout *SL = TD->getStructLayout(STy);
      Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
      continue;
    }

    APInt TypeSize(IntPtrWidth, TD->getTypeAllocSize(GTI.getIndexedType()));
    Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
  }
  return true;
}

bool CallAnalyzer::visitAlloca(AllocaInst &I) {
  // FIXME: Check whether inlining will turn a dynamic alloca into a static
  // alloca, and handle that case.

  // Accumulate the allocated size.
  if (I.isStaticAlloca()) {
    Type *Ty = I.getAllocatedType();
    AllocatedSize += (TD ? TD->getTypeAllocSize(Ty) :
                      Ty->getPrimitiveSizeInBits());
  }

  // We will happily inline static alloca instructions or dynamic alloca
  // instructions in always-inline situations.
  if (AlwaysInline || I.isStaticAlloca())
    return Base::visitAlloca(I);

  // FIXME: This is overly conservative. Dynamic allocas are inefficient for
  // a variety of reasons, and so we would like to not inline them into
  // functions which don't currently have a dynamic alloca. This simply
  // disables inlining altogether in the presence of a dynamic alloca.
  HasDynamicAlloca = true;
  return false;
}

bool CallAnalyzer::visitPHI(PHINode &I) {
  // FIXME: We should potentially be tracking values through phi nodes,
  // especially when they collapse to a single value due to deleted CFG edges
  // during inlining.

  // FIXME: We need to propagate SROA *disabling* through phi nodes, even
  // though we don't want to propagate its bonuses. The idea is to disable
  // SROA if it *might* be used in an inappropriate manner.

  // Phi nodes are always zero-cost.
  return true;
}

bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  bool SROACandidate = lookupSROAArgAndCost(I.getPointerOperand(),
                                            SROAArg, CostIt);

  // Try to fold GEPs of constant-offset call site argument pointers. This
  // requires target data and inbounds GEPs.
  if (TD && I.isInBounds()) {
    // Check if we have a base + offset for the pointer.
    Value *Ptr = I.getPointerOperand();
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
    if (BaseAndOffset.first) {
      // Check if the offset of this GEP is constant, and if so accumulate it
      // into Offset.
      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
        // Non-constant GEPs aren't folded, and disable SROA.
        if (SROACandidate)
          disableSROA(CostIt);
        return false;
      }

      // Add the result as a new mapping to Base + Offset.
      ConstantOffsetPtrs[&I] = BaseAndOffset;

      // Also handle SROA candidates here, we already know that the GEP is
      // all-constant indexed.
      if (SROACandidate)
        SROAArgValues[&I] = SROAArg;

      return true;
    }
  }

  if (isGEPOffsetConstant(I)) {
    if (SROACandidate)
      SROAArgValues[&I] = SROAArg;

    // Constant GEPs are modeled as free.
    return true;
  }

  // Variable GEPs will require math and will disable SROA.
  if (SROACandidate)
    disableSROA(CostIt);
  return false;
}

bool CallAnalyzer::visitBitCast(BitCastInst &I) {
  // Propagate constants through bitcasts.
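  // E.g. a bitcast whose operand is itself a constant, such as
  //   %p = bitcast [4 x i32]* @table to i8*
  // (with @table a hypothetical global), folds to a ConstantExpr and is free.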
  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offsets through casts.
  std::pair<Value *, APInt> BaseAndOffset
    = ConstantOffsetPtrs.lookup(I.getOperand(0));
  // Casts don't change the offset, just wrap it up.
  if (BaseAndOffset.first)
    ConstantOffsetPtrs[&I] = BaseAndOffset;

  // Also look for SROA candidates here.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  // Bitcasts are always zero cost.
  return true;
}

bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
  // Propagate constants through ptrtoint.
  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when converted to a plain integer provided the
  // integer is large enough to represent the pointer.
  unsigned IntegerSize = I.getType()->getScalarSizeInBits();
  if (TD && IntegerSize >= TD->getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset
      = ConstantOffsetPtrs.lookup(I.getOperand(0));
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // This is really weird. Technically, ptrtoint will disable SROA. However,
  // unless that ptrtoint is *used* somewhere in the live basic blocks after
  // inlining, it will be nuked, and SROA should proceed. All of the uses which
  // would block SROA would also block SROA if applied directly to a pointer,
  // and so we can just add the integer in here. The only places where SROA is
  // preserved either cannot fire on an integer, or won't in-and-of themselves
  // disable SROA (ext) w/o some later use that we would see and disable.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return isInstructionFree(&I, TD);
}

bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
  // Propagate constants through inttoptr.
  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Track base/offset pairs when round-tripped through a pointer without
  // modifications provided the integer is not too large.
  Value *Op = I.getOperand(0);
  unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
  if (TD && IntegerSize <= TD->getPointerSizeInBits()) {
    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
    if (BaseAndOffset.first)
      ConstantOffsetPtrs[&I] = BaseAndOffset;
  }

  // "Propagate" SROA here in the same manner as we do for ptrtoint above.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(Op, SROAArg, CostIt))
    SROAArgValues[&I] = SROAArg;

  return isInstructionFree(&I, TD);
}

bool CallAnalyzer::visitCastInst(CastInst &I) {
  // Propagate constants through casts.
  if (Constant *COp = dyn_cast<Constant>(I.getOperand(0)))
    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
  disableSROA(I.getOperand(0));

  return isInstructionFree(&I, TD);
}

bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
  Value *Operand = I.getOperand(0);
  Constant *Ops[1] = { dyn_cast<Constant>(Operand) };
  if (Ops[0] || (Ops[0] = SimplifiedValues.lookup(Operand)))
    if (Constant *C = ConstantFoldInstOperands(I.getOpcode(), I.getType(),
                                               Ops, TD)) {
      SimplifiedValues[&I] = C;
      return true;
    }

  // Disable any SROA on the argument to arbitrary unary operators.
  disableSROA(Operand);

  return false;
}

bool CallAnalyzer::visitICmp(ICmpInst &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  // First try to handle simplified comparisons.
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  if (Constant *CLHS = dyn_cast<Constant>(LHS))
    if (Constant *CRHS = dyn_cast<Constant>(RHS))
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        return true;
      }

  // Otherwise look for a comparison between constant offset pointers with
  // a common base.
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the icmp to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrCmps;
        return true;
      }
    }
  }

  // If the comparison is an equality comparison with null, we can simplify it
  // for any alloca-derived argument.
  if (I.isEquality() && isa<ConstantPointerNull>(I.getOperand(1)))
    if (isAllocaDerivedArg(I.getOperand(0))) {
      // We can actually predict the result of comparisons between an
      // alloca-derived value and null. Note that this fires regardless of
      // SROA firing.
      bool IsNotEqual = I.getPredicate() == CmpInst::ICMP_NE;
      SimplifiedValues[&I] = IsNotEqual ? ConstantInt::getTrue(I.getType())
                                        : ConstantInt::getFalse(I.getType());
      return true;
    }

  // Finally check for SROA candidates in comparisons.
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) {
    if (isa<ConstantPointerNull>(I.getOperand(1))) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitSub(BinaryOperator &I) {
  // Try to handle a special case: we can fold computing the difference of two
  // constant-related pointers.
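  // E.g. after visitPtrToInt has tracked both operands back to a common
  // argument base:
  //   %a = ptrtoint i8* %p8 to i64   ; base %arg, offset 8
  //   %b = ptrtoint i8* %p2 to i64   ; base %arg, offset 2
  //   %d = sub i64 %a, %b            ; folds to the constant 6 below
  // (%p8 and %p2 are illustrative GEPs off the same pointer argument.)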
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  Value *LHSBase, *RHSBase;
  APInt LHSOffset, RHSOffset;
  llvm::tie(LHSBase, LHSOffset) = ConstantOffsetPtrs.lookup(LHS);
  if (LHSBase) {
    llvm::tie(RHSBase, RHSOffset) = ConstantOffsetPtrs.lookup(RHS);
    if (RHSBase && LHSBase == RHSBase) {
      // We have common bases, fold the subtract to a constant based on the
      // offsets.
      Constant *CLHS = ConstantInt::get(LHS->getContext(), LHSOffset);
      Constant *CRHS = ConstantInt::get(RHS->getContext(), RHSOffset);
      if (Constant *C = ConstantExpr::getSub(CLHS, CRHS)) {
        SimplifiedValues[&I] = C;
        ++NumConstantPtrDiffs;
        return true;
      }
    }
  }

  // Otherwise, fall back to the generic logic for simplifying and handling
  // instructions.
  return Base::visitSub(I);
}

bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
  if (!isa<Constant>(LHS))
    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
      LHS = SimpleLHS;
  if (!isa<Constant>(RHS))
    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
      RHS = SimpleRHS;
  Value *SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, TD);
  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
    SimplifiedValues[&I] = C;
    return true;
  }

  // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
  disableSROA(LHS);
  disableSROA(RHS);

  return false;
}

bool CallAnalyzer::visitLoad(LoadInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitStore(StoreInst &I) {
  Value *SROAArg;
  DenseMap<Value *, int>::iterator CostIt;
  // Note: a store's pointer is operand 1, not operand 0; use the accessor so
  // we credit SROA for the address being stored to, not the value stored.
  if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) {
    if (I.isSimple()) {
      accumulateSROACost(CostIt, InlineConstants::InstrCost);
      return true;
    }

    disableSROA(CostIt);
  }

  return false;
}

bool CallAnalyzer::visitCallSite(CallSite CS) {
  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
      !F.getFnAttributes().hasReturnsTwiceAttr()) {
    // This aborts the entire analysis.
    ExposesReturnsTwice = true;
    return false;
  }

  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
    switch (II->getIntrinsicID()) {
    default:
      return Base::visitCallSite(CS);

    case Intrinsic::memset:
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // SROA can usually chew through these intrinsics, but they aren't free.
      return false;
    }
  }

  if (Function *F = CS.getCalledFunction()) {
    if (F == CS.getInstruction()->getParent()->getParent()) {
      // This flag will fully abort the analysis, so don't bother with anything
      // else.
      IsRecursiveCall = true;
      return false;
    }

    if (!callIsSmall(CS)) {
      // We account for the average 1 instruction per call argument setup
      // here.
      Cost += CS.arg_size() * InlineConstants::InstrCost;

      // Everything other than inline ASM will also have a significant cost
      // merely from making the call.
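      // (With the default constants in InlineConstants at the time of
      // writing, InstrCost = 5 and CallPenalty = 25, so e.g. a three-argument
      // call to a not-small callee is charged 3 * 5 + 25 = 40 here.)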
      if (!isa<InlineAsm>(CS.getCalledValue()))
        Cost += InlineConstants::CallPenalty;
    }

    return Base::visitCallSite(CS);
  }

  // Otherwise we're in a very special case -- an indirect function call. See
  // if we can be particularly clever about this.
  Value *Callee = CS.getCalledValue();

  // First, pay the price of the argument setup. We account for the average
  // 1 instruction per call argument setup here.
  Cost += CS.arg_size() * InlineConstants::InstrCost;

  // Next, check if this happens to be an indirect function call to a known
  // function in this inline context. If not, we've done all we can.
  Function *F = dyn_cast_or_null<Function>(SimplifiedValues.lookup(Callee));
  if (!F)
    return Base::visitCallSite(CS);

  // If we have a constant that we are calling as a function, we can peer
  // through it and see the function target. This happens not infrequently
  // during devirtualization and so we want to give it a hefty bonus for
  // inlining, but cap that bonus in the event that inlining wouldn't pan
  // out. Pretend to inline the function, with a custom threshold.
  CallAnalyzer CA(TD, *F, InlineConstants::IndirectCallThreshold);
  if (CA.analyzeCall(CS)) {
    // We were able to inline the indirect call! Subtract the cost from the
    // bonus we want to apply, but don't go below zero.
    Cost -= std::max(0, InlineConstants::IndirectCallThreshold - CA.getCost());
  }

  return Base::visitCallSite(CS);
}

bool CallAnalyzer::visitInstruction(Instruction &I) {
  // Some instructions are free. All of the free intrinsics can also be
  // handled by SROA, etc.
  if (isInstructionFree(&I, TD))
    return true;

  // We found something we don't understand or can't handle. Mark any SROA-able
  // values in the operand list as no longer viable.
  for (User::op_iterator OI = I.op_begin(), OE = I.op_end(); OI != OE; ++OI)
    disableSROA(*OI);

  return false;
}

/// \brief Analyze a basic block for its contribution to the inline cost.
///
/// This method walks the analyzer over every instruction in the given basic
/// block and accounts for their cost during inlining at this callsite. It
/// aborts early if the threshold has been exceeded or an impossible-to-inline
/// construct has been detected. It returns false if inlining is no longer
/// viable, and true if inlining remains viable.
bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
  for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
       I != E; ++I) {
    ++NumInstructions;
    if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
      ++NumVectorInstructions;

    // If the instruction simplified to a constant, there is no cost to this
    // instruction. Visit the instructions using our InstVisitor to account for
    // all of the per-instruction logic. The visit tree returns true if we
    // consumed the instruction in any way, and false if the instruction's base
    // cost should count against inlining.
    if (Base::visit(I))
      ++NumInstructionsSimplified;
    else
      Cost += InlineConstants::InstrCost;

    // If visiting this instruction detected an uninlinable pattern, abort.
    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
      return false;

    // If the caller is a recursive function then we don't want to inline
    // functions which allocate a lot of stack space because it would increase
    // the caller stack usage dramatically.
    if (IsCallerRecursive &&
        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
      return false;

    if (NumVectorInstructions > NumInstructions/2)
      VectorBonus = FiftyPercentVectorBonus;
    else if (NumVectorInstructions > NumInstructions/10)
      VectorBonus = TenPercentVectorBonus;
    else
      VectorBonus = 0;

    // Check if we've passed the threshold so we don't spin in huge basic
    // blocks that will never inline.
    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
      return false;
  }

  return true;
}

/// \brief Compute the base pointer and cumulative constant offsets for V.
///
/// This strips all constant offsets off of V, leaving it the base pointer, and
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
  if (!TD || !V->getType()->isPointerTy())
    return 0;

  unsigned IntPtrWidth = TD->getPointerSizeInBits();
  APInt Offset = APInt::getNullValue(IntPtrWidth);

  // Even though we don't look through PHI nodes, we could be called on an
  // instruction in an unreachable block, which may be on a cycle.
  SmallPtrSet<Value *, 4> Visited;
  Visited.insert(V);
  do {
    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
      if (!GEP->isInBounds() || !accumulateGEPOffset(*GEP, Offset))
        return 0;
      V = GEP->getPointerOperand();
    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
      V = cast<Operator>(V)->getOperand(0);
    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
      if (GA->mayBeOverridden())
        break;
      V = GA->getAliasee();
    } else {
      break;
    }
    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
  } while (Visited.insert(V));

  Type *IntPtrTy = TD->getIntPtrType(V->getContext());
  return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset));
}

/// \brief Analyze a call site for potential inlining.
///
/// Returns true if inlining this call is viable, and false if it is not
/// viable. It computes the cost and adjusts the threshold based on numerous
/// factors and heuristics. If this method returns false but the computed cost
/// is below the computed threshold, then inlining was forcibly disabled by
/// some artifact of the routine.
bool CallAnalyzer::analyzeCall(CallSite CS) {
  ++NumCallsAnalyzed;

  // Track whether the post-inlining function would have more than one basic
  // block. A single basic block is often intended for inlining. Balloon the
  // threshold by 50% until we pass the single-BB phase.
  bool SingleBB = true;
  int SingleBBBonus = Threshold / 2;
  Threshold += SingleBBBonus;

  // Unless we are always-inlining, perform some tweaks to the cost and
  // threshold based on the direct callsite information.
  if (!AlwaysInline) {
    // We want to more aggressively inline vector-dense kernels, so up the
    // threshold, and we'll lower it if the % of vector instructions gets too
    // low.
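    // Concretely, FiftyPercentVectorBonus doubles the effective threshold for
    // a callee whose instruction mix is over half vector instructions, and
    // TenPercentVectorBonus adds half again for a mix over one tenth; see the
    // VectorBonus selection in analyzeBlock.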
    assert(NumInstructions == 0);
    assert(NumVectorInstructions == 0);
    FiftyPercentVectorBonus = Threshold;
    TenPercentVectorBonus = Threshold / 2;

    // Give out bonuses per argument, as the instructions setting them up will
    // be gone after inlining.
    for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
      if (TD && CS.isByValArgument(I)) {
        // We approximate the number of loads and stores needed by dividing the
        // size of the byval type by the target's pointer size.
        PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
        unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType());
        unsigned PointerSize = TD->getPointerSizeInBits();
        // Ceiling division.
        unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;

        // If it generates more than 8 stores it is likely to be expanded as an
        // inline memcpy so we take that as an upper bound. Otherwise we assume
        // one load and one store per word copied.
        // FIXME: The maxStoresPerMemcpy setting from the target should be used
        // here instead of a magic number of 8, but it's not available via
        // TargetData.
        NumStores = std::min(NumStores, 8U);

        Cost -= 2 * NumStores * InlineConstants::InstrCost;
      } else {
        // For non-byval arguments subtract off one instruction per call
        // argument.
        Cost -= InlineConstants::InstrCost;
      }
    }

    // If there is only one call of the function, and it has internal linkage,
    // the cost of inlining it drops dramatically.
    if (F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction())
      Cost += InlineConstants::LastCallToStaticBonus;

    // If the instruction after the call, or the normal destination of the
    // invoke, is an unreachable instruction, the function is noreturn. As
    // such, there is little point in inlining this unless there is literally
    // zero cost.
    Instruction *Instr = CS.getInstruction();
    if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
      if (isa<UnreachableInst>(II->getNormalDest()->begin()))
        Threshold = 1;
    } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
      Threshold = 1;

    // If this function uses the coldcc calling convention, prefer not to
    // inline it.
    if (F.getCallingConv() == CallingConv::Cold)
      Cost += InlineConstants::ColdccPenalty;

    // Check if we're done. This can happen due to bonuses and penalties.
    if (Cost > Threshold)
      return false;
  }

  if (F.empty())
    return true;

  Function *Caller = CS.getInstruction()->getParent()->getParent();
  // Check if the caller function is recursive itself.
  for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
       U != E; ++U) {
    CallSite Site(cast<Value>(*U));
    if (!Site)
      continue;
    Instruction *I = Site.getInstruction();
    if (I->getParent()->getParent() == Caller) {
      IsCallerRecursive = true;
      break;
    }
  }

  // Track whether we've seen a return instruction. The first return
  // instruction is free, as at least one will usually disappear in inlining.
  bool HasReturn = false;

  // Populate our simplified values by mapping from function arguments to call
  // arguments with known important simplifications.
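  // For example, a caller passing the constant 42 lets any icmp or branch on
  // the matching parameter fold during the walk below, and a caller passing
  // the address of one of its own allocas makes the parameter an SROA
  // candidate whose loads and stores accrue cost savings.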
  CallSite::arg_iterator CAI = CS.arg_begin();
  for (Function::arg_iterator FAI = F.arg_begin(), FAE = F.arg_end();
       FAI != FAE; ++FAI, ++CAI) {
    assert(CAI != CS.arg_end());
    if (Constant *C = dyn_cast<Constant>(CAI))
      SimplifiedValues[FAI] = C;

    Value *PtrArg = *CAI;
    if (ConstantInt *C = stripAndComputeInBoundsConstantOffsets(PtrArg)) {
      ConstantOffsetPtrs[FAI] = std::make_pair(PtrArg, C->getValue());

      // We can SROA any pointer arguments derived from alloca instructions.
      if (isa<AllocaInst>(PtrArg)) {
        SROAArgValues[FAI] = PtrArg;
        SROAArgCosts[PtrArg] = 0;
      }
    }
  }
  NumConstantArgs = SimplifiedValues.size();
  NumConstantOffsetPtrArgs = ConstantOffsetPtrs.size();
  NumAllocaArgs = SROAArgValues.size();

  // The worklist of live basic blocks in the callee *after* inlining. We avoid
  // adding basic blocks of the callee which can be proven to be dead for this
  // particular call site in order to get more accurate cost estimates. This
  // requires a somewhat heavyweight iteration pattern: we need to walk the
  // basic blocks in a breadth-first order as we insert live successors. To
  // accomplish this while keeping iterations cheap (we exit early once we
  // cross our threshold), we use a small-size optimized SetVector.
  typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
                    SmallPtrSet<BasicBlock *, 16> > BBSetVector;
  BBSetVector BBWorklist;
  BBWorklist.insert(&F.getEntryBlock());
  // Note that we *must not* cache the size, this loop grows the worklist.
  for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
    // Bail out the moment we cross the threshold. This means we'll under-count
    // the cost, but only when undercounting doesn't matter.
    if (!AlwaysInline && Cost > (Threshold + VectorBonus))
      break;

    BasicBlock *BB = BBWorklist[Idx];
    if (BB->empty())
      continue;

    // Handle the terminator cost here where we can track returns and other
    // function-wide constructs.
    TerminatorInst *TI = BB->getTerminator();

    // We never want to inline functions that contain an indirectbr. Inlining
    // one would be incorrect because all the blockaddresses (in static global
    // initializers for example) would still be referring to the original
    // function, and this indirect jump would jump from the inlined copy of
    // the function into the original function, which is extremely undefined
    // behavior.
    // FIXME: This logic isn't really right; we can safely inline functions
    // with indirectbr's as long as no other function or global references the
    // blockaddress of a block within the current function. And as a QOI issue,
    // if someone is using a blockaddress without an indirectbr, and that
    // reference somehow ends up in another function or global, we probably
    // don't want to inline this function.
    if (isa<IndirectBrInst>(TI))
      return false;

    if (!HasReturn && isa<ReturnInst>(TI))
      HasReturn = true;
    else
      Cost += InlineConstants::InstrCost;

    // Analyze the cost of this block. If we blow through the threshold, this
    // returns false, and we can bail out.
    if (!analyzeBlock(BB)) {
      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
        return false;

      // If the caller is a recursive function then we don't want to inline
      // functions which allocate a lot of stack space because it would
      // increase the caller stack usage dramatically.
      if (IsCallerRecursive &&
          AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
        return false;

      break;
    }

    // Add in the live successors by first checking whether we have a
    // terminator that may be simplified based on the values simplified by
    // this call.
    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (BI->isConditional()) {
        Value *Cond = BI->getCondition();
        if (ConstantInt *SimpleCond
              = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
          BBWorklist.insert(BI->getSuccessor(SimpleCond->isZero() ? 1 : 0));
          continue;
        }
      }
    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
      Value *Cond = SI->getCondition();
      if (ConstantInt *SimpleCond
            = dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
        continue;
      }
    }

    // If we're unable to select a particular successor, just count all of
    // them.
    for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
         ++TIdx)
      BBWorklist.insert(TI->getSuccessor(TIdx));

    // If we had any successors at this point, then post-inlining is likely to
    // have them as well. Note that we assume any basic blocks which existed
    // due to branches or switches which folded above will also fold after
    // inlining.
    if (SingleBB && TI->getNumSuccessors() > 1) {
      // Take off the bonus we applied to the threshold.
      Threshold -= SingleBBBonus;
      SingleBB = false;
    }
  }

  Threshold += VectorBonus;

  return AlwaysInline || Cost < Threshold;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) llvm::dbgs() << "      " #x ": " << x << "\n"
  DEBUG_PRINT_STAT(NumConstantArgs);
  DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
  DEBUG_PRINT_STAT(NumAllocaArgs);
  DEBUG_PRINT_STAT(NumConstantPtrCmps);
  DEBUG_PRINT_STAT(NumConstantPtrDiffs);
  DEBUG_PRINT_STAT(NumInstructionsSimplified);
  DEBUG_PRINT_STAT(SROACostSavings);
  DEBUG_PRINT_STAT(SROACostSavingsLost);
#undef DEBUG_PRINT_STAT
}
#endif

InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) {
  return getInlineCost(CS, CS.getCalledFunction(), Threshold);
}

InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
                                             int Threshold) {
  // Don't inline functions which can be redefined at link-time to mean
  // something else. Don't inline functions marked noinline or call sites
  // marked noinline.
  if (!Callee || Callee->mayBeOverridden() ||
      Callee->getFnAttributes().hasNoInlineAttr() || CS.isNoInline())
    return llvm::InlineCost::getNever();

  DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
        << "...\n");

  CallAnalyzer CA(TD, *Callee, Threshold);
  bool ShouldInline = CA.analyzeCall(CS);

  DEBUG(CA.dump());

  // Check if there was a reason to force inlining or no inlining.
  if (!ShouldInline && CA.getCost() < CA.getThreshold())
    return InlineCost::getNever();
  if (ShouldInline && CA.getCost() >= CA.getThreshold())
    return InlineCost::getAlways();

  return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
}