//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//
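// For illustration, consider a hypothetical function of the form
//
//   int foo(int X) {
//     if (X > 10)
//       return 0;  // Cheap early-return path.
//     ...          // Large, rarely executed body.
//   }
//
// The guarding branch and the early return are inlined into the callers,
// while the large body is extracted into a separate, "outlined" function
// that is called only when the guard fails.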
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites that functions were partially inlined into.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user-annotated branch data is absent. It is the
// minimum value used to weight the outline region. If BFI produces a larger
// value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined, including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks, including the function entry, that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors of ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
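// For illustration (hypothetical IR), the simplest shape accepted by the
// outlining analysis (computeOutliningInfo below) is
//
//   entry:
//     br i1 %c, label %ret, label %cold    ; guarding entry block
//   cold:
//     ...                                  ; region to be outlined
//   ret:
//     ret void
//
// which is recorded as Entries = {entry}, ReturnBlock = ret, and
// NonReturnBlock = cold.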
struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);
  Function *unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
    ~FunctionCloner();

    // Prepare for function outlining: make sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining:
    Function *doFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;
    Function *OutlinedFunc = nullptr;
    BasicBlock *OutliningCallBB = nullptr;
    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented as a BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks.)
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if partially inlining the callee of CS is expected to be
  // profitable.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if any inlining succeeds.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from each use site of DuplicateFunction to the
  // enclosing BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in the
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime overhead of making the
  //   call to the outlined function, including the additional setup cost
  //   incurred in the outlined function itself.
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (note that the current
  // inline cost analysis does not clearly distinguish between the two
  // either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
  }
};

} // end anonymous namespace
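// For illustration (hypothetical IR), computeOutliningInfo below also
// accepts a chain of guarding entry blocks:
//
//   entry:
//     br i1 %c1, label %cold, label %guard2
//   guard2:
//     br i1 %c2, label %cold, label %ret
//   cold:
//     ...                                 ; NonReturnBlock, to be outlined
//   ret:
//     ret void
//
// This yields Entries = {entry, guard2}, ReturnBlock = ret,
// NonReturnBlock = cold, and ReturnBlockPreds = {guard2}.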
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);
  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Sanity check the entries: there should not be any successors (outside
  // of the entry set) other than {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now grow the candidate's inlining region further by peeling
  // dominating blocks off the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user-annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}
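// (For illustration: the MD_prof data checked above is the branch_weights
// metadata attached to conditional branches, e.g.
//   br i1 %c, label %t, label %f, !prof !1
//   ...
//   !1 = !{!"branch_weights", i32 2000, i32 1})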
BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/not taken), but the guessed
  // branch probability is usually not biased enough. When the outlined
  // region is predicted to be likely, its probability needs to be made
  // higher (more biased) so as not to underestimate the cost of function
  // outlining. On the other hand, if the outlined region is predicted to
  // be less likely, the predicted probability is usually higher than the
  // actual one. For instance, the actual probability of the less likely
  // target may be only 5%, while the guessed probability can be 40%. In
  // the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}
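// For illustration (hypothetical numbers): with no profile data, if BFI
// estimates the outlining call block at 60% of the entry frequency, the
// 45% threshold above is exceeded and the estimate is raised to
// max(60%, OutlineRegionFreqPercent) = 75%; an estimate of 30% would be
// returned unchanged.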
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted saving is smaller than the weighted cost, return false:
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because the runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share the Inliner's InlineCost analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as the runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}
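// For illustration (hypothetical numbers): if the call sequence in
// OutliningCallBB costs 25 units and the outlined function's adjusted cost
// exceeds the cost of the original outlined region by 20 units, the
// estimated runtime overhead computed below is 25 + 20 = 45 units (plus
// any ExtraOutliningPenalty).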
std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  // Now compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with BB layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite-to-profile-count map, which will be used to update
// the original function's entry count after the function is partially
// inlined into some callsites.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}
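// For illustration (hypothetical IR), NormalizeReturnBlock below rewrites a
// return block such as
//
//   ret:                  ; preds = %entry, %guard2, %cold1, %cold2
//     %r = phi i32 [ 0, %entry ], [ 1, %guard2 ], [ %a, %cold1 ], [ %b, %cold2 ]
//     ret i32 %r
//
// into a two-level PHI sequence, so that the values coming from the blocks
// to be outlined are merged inside the outlined region and only a single
// edge enters the new return block:
//
//   ret:                  ; stays inside the outlined region
//     %m = phi i32 [ %a, %cold1 ], [ %b, %cold2 ]
//     br label %ret.new
//   ret.new:              ; the normalized return block
//     %r = phi i32 [ %m, %ret ], [ 0, %entry ], [ 1, %guard2 ]
//     ret i32 %r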
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  auto getFirstPHI = [](BasicBlock *BB) -> PHINode * {
    // PHI nodes can only appear at the start of a block, so checking the
    // first instruction is sufficient.
    return dyn_cast<PHINode>(&BB->front());
  };

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After splitting the incoming values, the old PHI may become trivial.
    // Keeping a trivial PHI would introduce a definition inside the outlined
    // region that is live-out, causing avoidable overhead (load, store,
    // argument passing, etc.).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}
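// For illustration (hypothetical IR), after doFunctionOutlining the cloned
// function is reduced to the guarding blocks plus a call into the extracted
// cold region:
//
//   define i32 @foo.clone(i32 %x) {
//   entry:
//     %c = icmp sgt i32 %x, 10
//     br i1 %c, label %ret.new, label %cold
//   cold:                                  ; OutliningCallBB
//     call void @foo.clone.cold(...)       ; call to OutlinedFunc
//     br label %ret.new
//   ret.new:
//     ...
//   }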
Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted into the outlined function).
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           is_contained(ClonedOI->Entries, BB);
  };

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function, which may make the outlining
      // overhead (the difference between the outlined function cost
      // and OutlinedRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Extract the body of the if.
  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                               ClonedFuncBFI.get(), &BPI)
                     .extractCodeRegion();

  if (OutlinedFunc) {
    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
                          .getInstruction()
                          ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
  }

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove the function that was speculatively created if there is no
    // reference to it.
    if (OutlinedFunc)
      OutlinedFunc->eraseFromParent();
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {
  if (F->hasAddressTaken())
    return nullptr;

  // Let the inliner handle it.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;

  FunctionCloner Cloner(F, OI.get());
  Cloner.NormalizeReturnBlock();
  Function *OutlinedFunction = Cloner.doFunctionOutlining();

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return OutlinedFunction;

  return nullptr;
}
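// For illustration (hypothetical counts): if the original function's entry
// count is 1000 and the partially inlined call sites account for 600 of it,
// tryPartialInline below leaves the original function with an entry count
// of 400.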
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  int NonWeightedRcost;
  int SizeCost;

  if (Cloner.OutlinedFunc == nullptr)
    return false;

  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // If the call sequence to the outlined function is larger than the
  // original outlined region, partial inlining does not increase the
  // chances of inlining the function with outlining (the inliner uses
  // the size increase to model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;

    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined",
                                CS.getInstruction())
             << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
             << ore::NV("Caller", CS.getCaller());
    });

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    InlineFunction(CS, IFI);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats:
    NumPartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}