1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass performs partial inlining, typically by inlining an if statement 11 // that surrounds the body of the function. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/PartialInlining.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/Analysis/BlockFrequencyInfo.h" 18 #include "llvm/Analysis/BranchProbabilityInfo.h" 19 #include "llvm/Analysis/CodeMetrics.h" 20 #include "llvm/Analysis/InlineCost.h" 21 #include "llvm/Analysis/LoopInfo.h" 22 #include "llvm/Analysis/OptimizationDiagnosticInfo.h" 23 #include "llvm/Analysis/ProfileSummaryInfo.h" 24 #include "llvm/Analysis/TargetLibraryInfo.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/IR/CFG.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Instructions.h" 30 #include "llvm/IR/IntrinsicInst.h" 31 #include "llvm/IR/Module.h" 32 #include "llvm/Pass.h" 33 #include "llvm/Transforms/IPO.h" 34 #include "llvm/Transforms/Utils/Cloning.h" 35 #include "llvm/Transforms/Utils/CodeExtractor.h" 36 using namespace llvm; 37 38 #define DEBUG_TYPE "partial-inlining" 39 40 STATISTIC(NumPartialInlined, 41 "Number of callsites functions partially inlined into."); 42 43 // Command line option to disable partial-inlining. 
The default is false: 44 static cl::opt<bool> 45 DisablePartialInlining("disable-partial-inlining", cl::init(false), 46 cl::Hidden, cl::desc("Disable partial ininling")); 47 // This is an option used by testing: 48 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 49 cl::init(false), cl::ZeroOrMore, 50 cl::ReallyHidden, 51 cl::desc("Skip Cost Analysis")); 52 53 static cl::opt<unsigned> MaxNumInlineBlocks( 54 "max-num-inline-blocks", cl::init(5), cl::Hidden, 55 cl::desc("Max number of blocks to be partially inlined")); 56 57 // Command line option to set the maximum number of partial inlining allowed 58 // for the module. The default value of -1 means no limit. 59 static cl::opt<int> MaxNumPartialInlining( 60 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 61 cl::desc("Max number of partial inlining. The default is unlimited")); 62 63 // Used only when PGO or user annotated branch data is absent. It is 64 // the least value that is used to weigh the outline region. If BFI 65 // produces larger value, the BFI value will be used. 66 static cl::opt<int> 67 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 68 cl::Hidden, cl::ZeroOrMore, 69 cl::desc("Relative frequency of outline region to " 70 "the entry block")); 71 72 static cl::opt<unsigned> ExtraOutliningPenalty( 73 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 74 cl::desc("A debug option to add additional penalty to the computed one.")); 75 76 namespace { 77 78 struct FunctionOutliningInfo { 79 FunctionOutliningInfo() 80 : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr), 81 ReturnBlockPreds() {} 82 // Returns the number of blocks to be inlined including all blocks 83 // in Entries and one return block. 84 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } 85 86 // A set of blocks including the function entry that guard 87 // the region to be outlined. 
88 SmallVector<BasicBlock *, 4> Entries; 89 // The return block that is not included in the outlined region. 90 BasicBlock *ReturnBlock; 91 // The dominating block of the region to be outlined. 92 BasicBlock *NonReturnBlock; 93 // The set of blocks in Entries that that are predecessors to ReturnBlock 94 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 95 }; 96 97 struct PartialInlinerImpl { 98 PartialInlinerImpl( 99 std::function<AssumptionCache &(Function &)> *GetAC, 100 std::function<TargetTransformInfo &(Function &)> *GTTI, 101 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, 102 ProfileSummaryInfo *ProfSI) 103 : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} 104 bool run(Module &M); 105 Function *unswitchFunction(Function *F); 106 107 // This class speculatively clones the the function to be partial inlined. 108 // At the end of partial inlining, the remaining callsites to the cloned 109 // function that are not partially inlined will be fixed up to reference 110 // the original function, and the cloned function will be erased. 111 struct FunctionCloner { 112 FunctionCloner(Function *F, FunctionOutliningInfo *OI); 113 ~FunctionCloner(); 114 115 // Prepare for function outlining: making sure there is only 116 // one incoming edge from the extracted/outlined region to 117 // the return block. 118 void NormalizeReturnBlock(); 119 120 // Do function outlining: 121 Function *doFunctionOutlining(); 122 123 Function *OrigFunc = nullptr; 124 Function *ClonedFunc = nullptr; 125 Function *OutlinedFunc = nullptr; 126 BasicBlock *OutliningCallBB = nullptr; 127 // ClonedFunc is inlined in one of its callers after function 128 // outlining. 129 bool IsFunctionInlined = false; 130 // The cost of the region to be outlined. 
131 int OutlinedRegionCost = 0; 132 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 133 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 134 }; 135 136 private: 137 int NumPartialInlining = 0; 138 std::function<AssumptionCache &(Function &)> *GetAssumptionCache; 139 std::function<TargetTransformInfo &(Function &)> *GetTTI; 140 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; 141 ProfileSummaryInfo *PSI; 142 143 // Return the frequency of the OutlininingBB relative to F's entry point. 144 // The result is no larger than 1 and is represented using BP. 145 // (Note that the outlined region's 'head' block can only have incoming 146 // edges from the guarding entry blocks). 147 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner); 148 149 // Return true if the callee of CS should be partially inlined with 150 // profit. 151 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, 152 BlockFrequency WeightedOutliningRcost, 153 OptimizationRemarkEmitter &ORE); 154 155 // Try to inline DuplicateFunction (cloned from F with call to 156 // the OutlinedFunction into its callers. Return true 157 // if there is any successful inlining. 158 bool tryPartialInline(FunctionCloner &Cloner); 159 160 // Compute the mapping from use site of DuplicationFunction to the enclosing 161 // BB's profile count. 
162 void computeCallsiteToProfCountMap(Function *DuplicateFunction, 163 DenseMap<User *, uint64_t> &SiteCountMap); 164 165 bool IsLimitReached() { 166 return (MaxNumPartialInlining != -1 && 167 NumPartialInlining >= MaxNumPartialInlining); 168 } 169 170 static CallSite getCallSite(User *U) { 171 CallSite CS; 172 if (CallInst *CI = dyn_cast<CallInst>(U)) 173 CS = CallSite(CI); 174 else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) 175 CS = CallSite(II); 176 else 177 llvm_unreachable("All uses must be calls"); 178 return CS; 179 } 180 181 static CallSite getOneCallSiteTo(Function *F) { 182 User *User = *F->user_begin(); 183 return getCallSite(User); 184 } 185 186 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { 187 CallSite CS = getOneCallSiteTo(F); 188 DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); 189 BasicBlock *Block = CS.getParent(); 190 return std::make_tuple(DLoc, Block); 191 } 192 193 // Returns the costs associated with function outlining: 194 // - The first value is the non-weighted runtime cost for making the call 195 // to the outlined function, including the addtional setup cost in the 196 // outlined function itself; 197 // - The second value is the estimated size of the new call sequence in 198 // basic block Cloner.OutliningCallBB; 199 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); 200 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 201 // approximate both the size and runtime cost (Note that in the current 202 // inline cost analysis, there is no clear distinction there either). 
203 static int computeBBInlineCost(BasicBlock *BB); 204 205 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); 206 207 }; 208 209 struct PartialInlinerLegacyPass : public ModulePass { 210 static char ID; // Pass identification, replacement for typeid 211 PartialInlinerLegacyPass() : ModulePass(ID) { 212 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 213 } 214 215 void getAnalysisUsage(AnalysisUsage &AU) const override { 216 AU.addRequired<AssumptionCacheTracker>(); 217 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 218 AU.addRequired<TargetTransformInfoWrapperPass>(); 219 } 220 bool runOnModule(Module &M) override { 221 if (skipModule(M)) 222 return false; 223 224 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 225 TargetTransformInfoWrapperPass *TTIWP = 226 &getAnalysis<TargetTransformInfoWrapperPass>(); 227 ProfileSummaryInfo *PSI = 228 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 229 230 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 231 [&ACT](Function &F) -> AssumptionCache & { 232 return ACT->getAssumptionCache(F); 233 }; 234 235 std::function<TargetTransformInfo &(Function &)> GetTTI = 236 [&TTIWP](Function &F) -> TargetTransformInfo & { 237 return TTIWP->getTTI(F); 238 }; 239 240 return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M); 241 } 242 }; 243 } 244 245 std::unique_ptr<FunctionOutliningInfo> 246 PartialInlinerImpl::computeOutliningInfo(Function *F) { 247 BasicBlock *EntryBlock = &F->front(); 248 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 249 if (!BR || BR->isUnconditional()) 250 return std::unique_ptr<FunctionOutliningInfo>(); 251 252 // Returns true if Succ is BB's successor 253 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 254 return is_contained(successors(BB), Succ); 255 }; 256 257 auto SuccSize = [](BasicBlock *BB) { 258 return std::distance(succ_begin(BB), succ_end(BB)); 259 }; 260 261 
auto IsReturnBlock = [](BasicBlock *BB) { 262 TerminatorInst *TI = BB->getTerminator(); 263 return isa<ReturnInst>(TI); 264 }; 265 266 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 267 if (IsReturnBlock(Succ1)) 268 return std::make_tuple(Succ1, Succ2); 269 if (IsReturnBlock(Succ2)) 270 return std::make_tuple(Succ2, Succ1); 271 272 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 273 }; 274 275 // Detect a triangular shape: 276 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 277 if (IsSuccessor(Succ1, Succ2)) 278 return std::make_tuple(Succ1, Succ2); 279 if (IsSuccessor(Succ2, Succ1)) 280 return std::make_tuple(Succ2, Succ1); 281 282 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 283 }; 284 285 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 286 llvm::make_unique<FunctionOutliningInfo>(); 287 288 BasicBlock *CurrEntry = EntryBlock; 289 bool CandidateFound = false; 290 do { 291 // The number of blocks to be inlined has already reached 292 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 293 // disables partial inlining for the function. 
294 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) 295 break; 296 297 if (SuccSize(CurrEntry) != 2) 298 break; 299 300 BasicBlock *Succ1 = *succ_begin(CurrEntry); 301 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 302 303 BasicBlock *ReturnBlock, *NonReturnBlock; 304 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 305 306 if (ReturnBlock) { 307 OutliningInfo->Entries.push_back(CurrEntry); 308 OutliningInfo->ReturnBlock = ReturnBlock; 309 OutliningInfo->NonReturnBlock = NonReturnBlock; 310 CandidateFound = true; 311 break; 312 } 313 314 BasicBlock *CommSucc; 315 BasicBlock *OtherSucc; 316 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 317 318 if (!CommSucc) 319 break; 320 321 OutliningInfo->Entries.push_back(CurrEntry); 322 CurrEntry = OtherSucc; 323 324 } while (true); 325 326 if (!CandidateFound) 327 return std::unique_ptr<FunctionOutliningInfo>(); 328 329 // Do sanity check of the entries: threre should not 330 // be any successors (not in the entry set) other than 331 // {ReturnBlock, NonReturnBlock} 332 assert(OutliningInfo->Entries[0] == &F->front() && 333 "Function Entry must be the first in Entries vector"); 334 DenseSet<BasicBlock *> Entries; 335 for (BasicBlock *E : OutliningInfo->Entries) 336 Entries.insert(E); 337 338 // Returns true of BB has Predecessor which is not 339 // in Entries set. 
340 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 341 for (auto Pred : predecessors(BB)) { 342 if (!Entries.count(Pred)) 343 return true; 344 } 345 return false; 346 }; 347 auto CheckAndNormalizeCandidate = 348 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 349 for (BasicBlock *E : OutliningInfo->Entries) { 350 for (auto Succ : successors(E)) { 351 if (Entries.count(Succ)) 352 continue; 353 if (Succ == OutliningInfo->ReturnBlock) 354 OutliningInfo->ReturnBlockPreds.push_back(E); 355 else if (Succ != OutliningInfo->NonReturnBlock) 356 return false; 357 } 358 // There should not be any outside incoming edges either: 359 if (HasNonEntryPred(E)) 360 return false; 361 } 362 return true; 363 }; 364 365 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 366 return std::unique_ptr<FunctionOutliningInfo>(); 367 368 // Now further growing the candidate's inlining region by 369 // peeling off dominating blocks from the outlining region: 370 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { 371 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 372 if (SuccSize(Cand) != 2) 373 break; 374 375 if (HasNonEntryPred(Cand)) 376 break; 377 378 BasicBlock *Succ1 = *succ_begin(Cand); 379 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 380 381 BasicBlock *ReturnBlock, *NonReturnBlock; 382 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 383 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 384 break; 385 386 if (NonReturnBlock->getSinglePredecessor() != Cand) 387 break; 388 389 // Now grow and update OutlininigInfo: 390 OutliningInfo->Entries.push_back(Cand); 391 OutliningInfo->NonReturnBlock = NonReturnBlock; 392 OutliningInfo->ReturnBlockPreds.push_back(Cand); 393 Entries.insert(Cand); 394 } 395 396 return OutliningInfo; 397 } 398 399 // Check if there is PGO data or user annoated branch data: 400 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { 401 if (F->getEntryCount()) 402 return true; 
403 // Now check if any of the entry block has MD_prof data: 404 for (auto *E : OI->Entries) { 405 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 406 if (!BR || BR->isUnconditional()) 407 continue; 408 uint64_t T, F; 409 if (BR->extractProfMetadata(T, F)) 410 return true; 411 } 412 return false; 413 } 414 415 BranchProbability 416 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { 417 418 auto EntryFreq = 419 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 420 auto OutliningCallFreq = 421 Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB); 422 423 auto OutlineRegionRelFreq = 424 BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), 425 EntryFreq.getFrequency()); 426 427 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) 428 return OutlineRegionRelFreq; 429 430 // When profile data is not available, we need to be conservative in 431 // estimating the overall savings. Static branch prediction can usually 432 // guess the branch direction right (taken/non-taken), but the guessed 433 // branch probability is usually not biased enough. In case when the 434 // outlined region is predicted to be likely, its probability needs 435 // to be made higher (more biased) to not under-estimate the cost of 436 // function outlining. On the other hand, if the outlined region 437 // is predicted to be less likely, the predicted probablity is usually 438 // higher than the actual. For instance, the actual probability of the 439 // less likely target is only 5%, but the guessed probablity can be 440 // 40%. In the latter case, there is no need for further adjustement. 441 // FIXME: add an option for this. 
442 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 443 return OutlineRegionRelFreq; 444 445 OutlineRegionRelFreq = std::max( 446 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 447 448 return OutlineRegionRelFreq; 449 } 450 451 bool PartialInlinerImpl::shouldPartialInline( 452 CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, 453 OptimizationRemarkEmitter &ORE) { 454 455 using namespace ore; 456 if (SkipCostAnalysis) 457 return true; 458 459 Instruction *Call = CS.getInstruction(); 460 Function *Callee = CS.getCalledFunction(); 461 assert(Callee == Cloner.ClonedFunc); 462 463 Function *Caller = CS.getCaller(); 464 auto &CalleeTTI = (*GetTTI)(*Callee); 465 InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, 466 *GetAssumptionCache, GetBFI, PSI, &ORE); 467 468 if (IC.isAlways()) { 469 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) 470 << NV("Callee", Cloner.OrigFunc) 471 << " should always be fully inlined, not partially"); 472 return false; 473 } 474 475 if (IC.isNever()) { 476 ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) 477 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 478 << NV("Caller", Caller) 479 << " because it should never be inlined (cost=never)"); 480 return false; 481 } 482 483 if (!IC) { 484 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) 485 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 486 << NV("Caller", Caller) << " because too costly to inline (cost=" 487 << NV("Cost", IC.getCost()) << ", threshold=" 488 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); 489 return false; 490 } 491 const DataLayout &DL = Caller->getParent()->getDataLayout(); 492 493 // The savings of eliminating the call: 494 int NonWeightedSavings = getCallsiteCost(CS, DL); 495 BlockFrequency NormWeightedSavings(NonWeightedSavings); 496 497 // Weighted saving is smaller than weighted cost, return 
false 498 if (NormWeightedSavings < WeightedOutliningRcost) { 499 ORE.emit( 500 OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) 501 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 502 << NV("Caller", Caller) << " runtime overhead (overhead=" 503 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 504 << ", savings=" 505 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" 506 << " of making the outlined call is too high"); 507 508 return false; 509 } 510 511 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) 512 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 513 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 514 << " (threshold=" 515 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); 516 return true; 517 } 518 519 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 520 // For now use a simplified version. The returned 'InlineCost' will be used 521 // to esimate the size cost as well as runtime cost of the BB. 
// Estimate the size/runtime cost of BB with a simplified inline-cost model:
// debug intrinsics, free casts, allocas and all-zero GEPs cost nothing;
// lifetime markers cost nothing; calls/invokes are charged via
// getCallsiteCost; switches are charged per case; everything else costs
// one InstrCost.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    // Debug intrinsics generate no machine code.
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
      // Intentional fall-through: a GEP with non-zero indices is charged
      // like an ordinary instruction below.
    default:
      break;
    }

    // Lifetime markers are bookkeeping only; skip them before the generic
    // call handling below (IntrinsicInst is a CallInst).
    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      // One comparison per case plus the default.
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

// Returns (size cost of the call sequence in OutliningCallBB,
// non-weighted runtime overhead of outlining). The overhead is the call
// cost plus however much larger the outlined function is than the region
// it replaced, plus the debug penalty option.
std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  // Now compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  // Point CurrentCallerBFI at BFI for Caller, computing it locally when no
  // BFI getter is available (legacy pass manager).
  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    // BFI is recomputed only when the caller differs from the previous
    // iteration's caller.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}
PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Remap the outlining info through VMap so it refers to the cloned blocks.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Returns the PHI at the head of BB, or null if BB does not start with
  // a PHI. (Note: both branches break on the first iteration, so only the
  // first instruction is ever examined.)
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // Nothing to do when there is no PHI, or when every PHI input already
  // comes from the entries plus a single region edge.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // Returns the single shared incoming value when every input of PN is the
  // same, otherwise null.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split the return block after its PHIs; the new block becomes the
  // real return block, and PreReturn keeps the region-side PHIs.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Second-level PHI in the new return block: one input from the region
    // (via PreReturn) plus one per guarding entry block.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Retarget the entry blocks' branches from PreReturn to the new
  // (split-off) return block.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
            ClonedOI->Entries.end());
  };

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Extract the body of the if.
  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                               ClonedFuncBFI.get(), &BPI)
                     .extractCodeRegion();

  if (OutlinedFunc) {
    // The extractor replaced the region with a call to OutlinedFunc;
    // remember the block containing that call for cost/frequency queries.
    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
                          .getInstruction()
                          ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
  }

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove the function that is speculatively created if there is no
    // reference.
    if (OutlinedFunc)
      OutlinedFunc->eraseFromParent();
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {
  // Address-taken functions may be called indirectly; skip them.
  if (F->hasAddressTaken())
    return nullptr;

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  // Skip functions whose entry is cold per the profile summary.
  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  // Nothing to do without callers.
  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;

  // The cloner's destructor redirects remaining users back to F and
  // cleans up speculatively created functions when nothing was inlined.
  FunctionCloner Cloner(F, OI.get());
  Cloner.NormalizeReturnBlock();
  Function *OutlinedFunction = Cloner.doFunctionOutlining();

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return OutlinedFunction;

  return nullptr;
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  int NonWeightedRcost;
  int SizeCost;

  // Outlining may have failed (e.g. the extractor bailed out).
  if (Cloner.OutlinedFunc == nullptr)
    return false;

  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Scale the runtime cost by how often the outlining call executes
  // relative to the entry.
  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence to the outlined function is larger than the original
  // outlined region size, it does not increase the chances of inlining
  // the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")");
    return false;
  }

  // The cloner's constructor RAUW'd OrigFunc to ClonedFunc.
  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  // Snapshot the users: inlining below mutates the use list.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    // Respect the per-module partial-inlining budget.
    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;

    ORE.emit(
        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
        << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
        << ore::NV("Caller", CS.getCaller()));

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    InlineFunction(CS, IFI);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      // Saturating subtraction: never let the entry count go negative.
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    NumPartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  // Seed the worklist with every defined function that has at least one use.
  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    // Uses may have disappeared since the function was queued.
    if (CurrFunc->use_empty())
      continue;

    // Skip functions that call themselves directly.
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    // A successfully outlined function is itself a new candidate.
    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}
char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

// Factory for the legacy pass manager.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

// New pass manager entry point: fetch the function-level analyses
// (assumption cache, BFI, TTI) through the module proxy and the module-level
// profile summary, then run PartialInlinerImpl over the module.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  // Conservatively invalidate everything if anything changed.
  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}