1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass performs partial inlining, typically by inlining an if statement 11 // that surrounds the body of the function. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/PartialInlining.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/Analysis/BlockFrequencyInfo.h" 18 #include "llvm/Analysis/BranchProbabilityInfo.h" 19 #include "llvm/Analysis/CodeMetrics.h" 20 #include "llvm/Analysis/InlineCost.h" 21 #include "llvm/Analysis/LoopInfo.h" 22 #include "llvm/Analysis/OptimizationDiagnosticInfo.h" 23 #include "llvm/Analysis/ProfileSummaryInfo.h" 24 #include "llvm/Analysis/TargetLibraryInfo.h" 25 #include "llvm/Analysis/TargetTransformInfo.h" 26 #include "llvm/IR/CFG.h" 27 #include "llvm/IR/DiagnosticInfo.h" 28 #include "llvm/IR/Dominators.h" 29 #include "llvm/IR/Instructions.h" 30 #include "llvm/IR/Module.h" 31 #include "llvm/Pass.h" 32 #include "llvm/Transforms/IPO.h" 33 #include "llvm/Transforms/Utils/Cloning.h" 34 #include "llvm/Transforms/Utils/CodeExtractor.h" 35 using namespace llvm; 36 37 #define DEBUG_TYPE "partial-inlining" 38 39 STATISTIC(NumPartialInlined, 40 "Number of callsites functions partially inlined into."); 41 42 // Command line option to disable partial-inlining. The default is false: 43 static cl::opt<bool> 44 DisablePartialInlining("disable-partial-inlining", cl::init(false), 45 cl::Hidden, cl::desc("Disable partial ininling")); 46 // This is an option used by testing: 47 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 48 cl::init(false), cl::ZeroOrMore, 49 cl::ReallyHidden, 50 cl::desc("Skip Cost Analysis")); 51 52 static cl::opt<unsigned> MaxNumInlineBlocks( 53 "max-num-inline-blocks", cl::init(5), cl::Hidden, 54 cl::desc("Max number of blocks to be partially inlined")); 55 56 // Command line option to set the maximum number of partial inlining allowed 57 // for the module. The default value of -1 means no limit. 58 static cl::opt<int> MaxNumPartialInlining( 59 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 60 cl::desc("Max number of partial inlining. The default is unlimited")); 61 62 // Used only when PGO or user annotated branch data is absent. It is 63 // the least value that is used to weigh the outline region. If BFI 64 // produces larger value, the BFI value will be used. 65 static cl::opt<int> 66 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 67 cl::Hidden, cl::ZeroOrMore, 68 cl::desc("Relative frequency of outline region to " 69 "the entry block")); 70 71 static cl::opt<unsigned> ExtraOutliningPenalty( 72 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 73 cl::desc("A debug option to add additional penalty to the computed one.")); 74 75 namespace { 76 77 struct FunctionOutliningInfo { 78 FunctionOutliningInfo() 79 : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr), 80 ReturnBlockPreds() {} 81 // Returns the number of blocks to be inlined including all blocks 82 // in Entries and one return block. 83 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } 84 85 // A set of blocks including the function entry that guard 86 // the region to be outlined. 87 SmallVector<BasicBlock *, 4> Entries; 88 // The return block that is not included in the outlined region. 89 BasicBlock *ReturnBlock; 90 // The dominating block of the region to be outlined. 91 BasicBlock *NonReturnBlock; 92 // The set of blocks in Entries that that are predecessors to ReturnBlock 93 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 94 }; 95 96 struct PartialInlinerImpl { 97 PartialInlinerImpl( 98 std::function<AssumptionCache &(Function &)> *GetAC, 99 std::function<TargetTransformInfo &(Function &)> *GTTI, 100 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, 101 ProfileSummaryInfo *ProfSI) 102 : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {} 103 bool run(Module &M); 104 Function *unswitchFunction(Function *F); 105 106 // This class speculatively clones the the function to be partial inlined. 107 // At the end of partial inlining, the remaining callsites to the cloned 108 // function that are not partially inlined will be fixed up to reference 109 // the original function, and the cloned function will be erased. 110 struct FunctionCloner { 111 FunctionCloner(Function *F, FunctionOutliningInfo *OI); 112 ~FunctionCloner(); 113 114 // Prepare for function outlining: making sure there is only 115 // one incoming edge from the extracted/outlined region to 116 // the return block. 117 void NormalizeReturnBlock(); 118 119 // Do function outlining: 120 Function *doFunctionOutlining(); 121 122 Function *OrigFunc = nullptr; 123 Function *ClonedFunc = nullptr; 124 Function *OutlinedFunc = nullptr; 125 BasicBlock *OutliningCallBB = nullptr; 126 // ClonedFunc is inlined in one of its callers after function 127 // outlining. 128 bool IsFunctionInlined = false; 129 // The cost of the region to be outlined. 130 int OutlinedRegionCost = 0; 131 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 132 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 133 }; 134 135 private: 136 int NumPartialInlining = 0; 137 std::function<AssumptionCache &(Function &)> *GetAssumptionCache; 138 std::function<TargetTransformInfo &(Function &)> *GetTTI; 139 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; 140 ProfileSummaryInfo *PSI; 141 142 // Return the frequency of the OutlininingBB relative to F's entry point. 143 // The result is no larger than 1 and is represented using BP. 144 // (Note that the outlined region's 'head' block can only have incoming 145 // edges from the guarding entry blocks). 146 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner); 147 148 // Return true if the callee of CS should be partially inlined with 149 // profit. 150 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, 151 BlockFrequency WeightedOutliningRcost, 152 OptimizationRemarkEmitter &ORE); 153 154 // Try to inline DuplicateFunction (cloned from F with call to 155 // the OutlinedFunction into its callers. Return true 156 // if there is any successful inlining. 157 bool tryPartialInline(FunctionCloner &Cloner); 158 159 // Compute the mapping from use site of DuplicationFunction to the enclosing 160 // BB's profile count. 161 void computeCallsiteToProfCountMap(Function *DuplicateFunction, 162 DenseMap<User *, uint64_t> &SiteCountMap); 163 164 bool IsLimitReached() { 165 return (MaxNumPartialInlining != -1 && 166 NumPartialInlining >= MaxNumPartialInlining); 167 } 168 169 static CallSite getCallSite(User *U) { 170 CallSite CS; 171 if (CallInst *CI = dyn_cast<CallInst>(U)) 172 CS = CallSite(CI); 173 else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) 174 CS = CallSite(II); 175 else 176 llvm_unreachable("All uses must be calls"); 177 return CS; 178 } 179 180 static CallSite getOneCallSiteTo(Function *F) { 181 User *User = *F->user_begin(); 182 return getCallSite(User); 183 } 184 185 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { 186 CallSite CS = getOneCallSiteTo(F); 187 DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); 188 BasicBlock *Block = CS.getParent(); 189 return std::make_tuple(DLoc, Block); 190 } 191 192 // Returns the costs associated with function outlining: 193 // - The first value is the non-weighted runtime cost for making the call 194 // to the outlined function, including the addtional setup cost in the 195 // outlined function itself; 196 // - The second value is the estimated size of the new call sequence in 197 // basic block Cloner.OutliningCallBB; 198 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); 199 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 200 // approximate both the size and runtime cost (Note that in the current 201 // inline cost analysis, there is no clear distinction there either). 202 static int computeBBInlineCost(BasicBlock *BB); 203 204 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); 205 206 }; 207 208 struct PartialInlinerLegacyPass : public ModulePass { 209 static char ID; // Pass identification, replacement for typeid 210 PartialInlinerLegacyPass() : ModulePass(ID) { 211 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 212 } 213 214 void getAnalysisUsage(AnalysisUsage &AU) const override { 215 AU.addRequired<AssumptionCacheTracker>(); 216 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 217 AU.addRequired<TargetTransformInfoWrapperPass>(); 218 } 219 bool runOnModule(Module &M) override { 220 if (skipModule(M)) 221 return false; 222 223 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 224 TargetTransformInfoWrapperPass *TTIWP = 225 &getAnalysis<TargetTransformInfoWrapperPass>(); 226 ProfileSummaryInfo *PSI = 227 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 228 229 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 230 [&ACT](Function &F) -> AssumptionCache & { 231 return ACT->getAssumptionCache(F); 232 }; 233 234 std::function<TargetTransformInfo &(Function &)> GetTTI = 235 [&TTIWP](Function &F) -> TargetTransformInfo & { 236 return TTIWP->getTTI(F); 237 }; 238 239 return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M); 240 } 241 }; 242 } 243 244 std::unique_ptr<FunctionOutliningInfo> 245 PartialInlinerImpl::computeOutliningInfo(Function *F) { 246 BasicBlock *EntryBlock = &F->front(); 247 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 248 if (!BR || BR->isUnconditional()) 249 return std::unique_ptr<FunctionOutliningInfo>(); 250 251 // Returns true if Succ is BB's successor 252 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 253 return is_contained(successors(BB), Succ); 254 }; 255 256 auto SuccSize = [](BasicBlock *BB) { 257 return std::distance(succ_begin(BB), succ_end(BB)); 258 }; 259 260 auto IsReturnBlock = [](BasicBlock *BB) { 261 TerminatorInst *TI = BB->getTerminator(); 262 return isa<ReturnInst>(TI); 263 }; 264 265 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 266 if (IsReturnBlock(Succ1)) 267 return std::make_tuple(Succ1, Succ2); 268 if (IsReturnBlock(Succ2)) 269 return std::make_tuple(Succ2, Succ1); 270 271 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 272 }; 273 274 // Detect a triangular shape: 275 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 276 if (IsSuccessor(Succ1, Succ2)) 277 return std::make_tuple(Succ1, Succ2); 278 if (IsSuccessor(Succ2, Succ1)) 279 return std::make_tuple(Succ2, Succ1); 280 281 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 282 }; 283 284 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 285 llvm::make_unique<FunctionOutliningInfo>(); 286 287 BasicBlock *CurrEntry = EntryBlock; 288 bool CandidateFound = false; 289 do { 290 // The number of blocks to be inlined has already reached 291 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 292 // disables partial inlining for the function. 293 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) 294 break; 295 296 if (SuccSize(CurrEntry) != 2) 297 break; 298 299 BasicBlock *Succ1 = *succ_begin(CurrEntry); 300 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 301 302 BasicBlock *ReturnBlock, *NonReturnBlock; 303 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 304 305 if (ReturnBlock) { 306 OutliningInfo->Entries.push_back(CurrEntry); 307 OutliningInfo->ReturnBlock = ReturnBlock; 308 OutliningInfo->NonReturnBlock = NonReturnBlock; 309 CandidateFound = true; 310 break; 311 } 312 313 BasicBlock *CommSucc; 314 BasicBlock *OtherSucc; 315 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 316 317 if (!CommSucc) 318 break; 319 320 OutliningInfo->Entries.push_back(CurrEntry); 321 CurrEntry = OtherSucc; 322 323 } while (true); 324 325 if (!CandidateFound) 326 return std::unique_ptr<FunctionOutliningInfo>(); 327 328 // Do sanity check of the entries: threre should not 329 // be any successors (not in the entry set) other than 330 // {ReturnBlock, NonReturnBlock} 331 assert(OutliningInfo->Entries[0] == &F->front() && 332 "Function Entry must be the first in Entries vector"); 333 DenseSet<BasicBlock *> Entries; 334 for (BasicBlock *E : OutliningInfo->Entries) 335 Entries.insert(E); 336 337 // Returns true of BB has Predecessor which is not 338 // in Entries set. 339 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 340 for (auto Pred : predecessors(BB)) { 341 if (!Entries.count(Pred)) 342 return true; 343 } 344 return false; 345 }; 346 auto CheckAndNormalizeCandidate = 347 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 348 for (BasicBlock *E : OutliningInfo->Entries) { 349 for (auto Succ : successors(E)) { 350 if (Entries.count(Succ)) 351 continue; 352 if (Succ == OutliningInfo->ReturnBlock) 353 OutliningInfo->ReturnBlockPreds.push_back(E); 354 else if (Succ != OutliningInfo->NonReturnBlock) 355 return false; 356 } 357 // There should not be any outside incoming edges either: 358 if (HasNonEntryPred(E)) 359 return false; 360 } 361 return true; 362 }; 363 364 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 365 return std::unique_ptr<FunctionOutliningInfo>(); 366 367 // Now further growing the candidate's inlining region by 368 // peeling off dominating blocks from the outlining region: 369 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { 370 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 371 if (SuccSize(Cand) != 2) 372 break; 373 374 if (HasNonEntryPred(Cand)) 375 break; 376 377 BasicBlock *Succ1 = *succ_begin(Cand); 378 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 379 380 BasicBlock *ReturnBlock, *NonReturnBlock; 381 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 382 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 383 break; 384 385 if (NonReturnBlock->getSinglePredecessor() != Cand) 386 break; 387 388 // Now grow and update OutlininigInfo: 389 OutliningInfo->Entries.push_back(Cand); 390 OutliningInfo->NonReturnBlock = NonReturnBlock; 391 OutliningInfo->ReturnBlockPreds.push_back(Cand); 392 Entries.insert(Cand); 393 } 394 395 return OutliningInfo; 396 } 397 398 // Check if there is PGO data or user annoated branch data: 399 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { 400 if (F->getEntryCount()) 401 return true; 402 // Now check if any of the entry block has MD_prof data: 403 for (auto *E : OI->Entries) { 404 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 405 if (!BR || BR->isUnconditional()) 406 continue; 407 uint64_t T, F; 408 if (BR->extractProfMetadata(T, F)) 409 return true; 410 } 411 return false; 412 } 413 414 BranchProbability 415 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { 416 417 auto EntryFreq = 418 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 419 auto OutliningCallFreq = 420 Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB); 421 422 auto OutlineRegionRelFreq = 423 BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), 424 EntryFreq.getFrequency()); 425 426 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) 427 return OutlineRegionRelFreq; 428 429 // When profile data is not available, we need to be conservative in 430 // estimating the overall savings. Static branch prediction can usually 431 // guess the branch direction right (taken/non-taken), but the guessed 432 // branch probability is usually not biased enough. In case when the 433 // outlined region is predicted to be likely, its probability needs 434 // to be made higher (more biased) to not under-estimate the cost of 435 // function outlining. On the other hand, if the outlined region 436 // is predicted to be less likely, the predicted probablity is usually 437 // higher than the actual. For instance, the actual probability of the 438 // less likely target is only 5%, but the guessed probablity can be 439 // 40%. In the latter case, there is no need for further adjustement. 440 // FIXME: add an option for this. 441 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 442 return OutlineRegionRelFreq; 443 444 OutlineRegionRelFreq = std::max( 445 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 446 447 return OutlineRegionRelFreq; 448 } 449 450 bool PartialInlinerImpl::shouldPartialInline( 451 CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, 452 OptimizationRemarkEmitter &ORE) { 453 454 using namespace ore; 455 if (SkipCostAnalysis) 456 return true; 457 458 Instruction *Call = CS.getInstruction(); 459 Function *Callee = CS.getCalledFunction(); 460 assert(Callee == Cloner.ClonedFunc); 461 462 Function *Caller = CS.getCaller(); 463 auto &CalleeTTI = (*GetTTI)(*Callee); 464 InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, 465 *GetAssumptionCache, GetBFI, PSI, &ORE); 466 467 if (IC.isAlways()) { 468 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) 469 << NV("Callee", Cloner.OrigFunc) 470 << " should always be fully inlined, not partially"); 471 return false; 472 } 473 474 if (IC.isNever()) { 475 ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) 476 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 477 << NV("Caller", Caller) 478 << " because it should never be inlined (cost=never)"); 479 return false; 480 } 481 482 if (!IC) { 483 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) 484 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 485 << NV("Caller", Caller) << " because too costly to inline (cost=" 486 << NV("Cost", IC.getCost()) << ", threshold=" 487 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); 488 return false; 489 } 490 const DataLayout &DL = Caller->getParent()->getDataLayout(); 491 492 // The savings of eliminating the call: 493 int NonWeightedSavings = getCallsiteCost(CS, DL); 494 BlockFrequency NormWeightedSavings(NonWeightedSavings); 495 496 // Weighted saving is smaller than weighted cost, return false 497 if (NormWeightedSavings < WeightedOutliningRcost) { 498 ORE.emit( 499 OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) 500 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 501 << NV("Caller", Caller) << " runtime overhead (overhead=" 502 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 503 << ", savings=" 504 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" 505 << " of making the outlined call is too high"); 506 507 return false; 508 } 509 510 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) 511 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 512 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 513 << " (threshold=" 514 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); 515 return true; 516 } 517 518 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 519 // For now use a simplified version. The returned 'InlineCost' will be used 520 // to esimate the size cost as well as runtime cost of the BB. 521 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { 522 int InlineCost = 0; 523 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 524 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 525 if (isa<DbgInfoIntrinsic>(I)) 526 continue; 527 528 switch (I->getOpcode()) { 529 case Instruction::BitCast: 530 case Instruction::PtrToInt: 531 case Instruction::IntToPtr: 532 case Instruction::Alloca: 533 continue; 534 case Instruction::GetElementPtr: 535 if (cast<GetElementPtrInst>(I)->hasAllZeroIndices()) 536 continue; 537 default: 538 break; 539 } 540 541 IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I); 542 if (IntrInst) { 543 if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || 544 IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) 545 continue; 546 } 547 548 if (CallInst *CI = dyn_cast<CallInst>(I)) { 549 InlineCost += getCallsiteCost(CallSite(CI), DL); 550 continue; 551 } 552 553 if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { 554 InlineCost += getCallsiteCost(CallSite(II), DL); 555 continue; 556 } 557 558 if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { 559 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 560 continue; 561 } 562 InlineCost += InlineConstants::InstrCost; 563 } 564 return InlineCost; 565 } 566 567 std::tuple<int, int> 568 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { 569 570 // Now compute the cost of the call sequence to the outlined function 571 // 'OutlinedFunction' in BB 'OutliningCallBB': 572 int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB); 573 574 // Now compute the cost of the extracted/outlined function itself: 575 int OutlinedFunctionCost = 0; 576 for (BasicBlock &BB : *Cloner.OutlinedFunc) { 577 OutlinedFunctionCost += computeBBInlineCost(&BB); 578 } 579 580 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 581 "Outlined function cost should be no less than the outlined region"); 582 // The code extractor introduces a new root and exit stub blocks with 583 // additional unconditional branches. Those branches will be eliminated 584 // later with bb layout. The cost should be adjusted accordingly: 585 OutlinedFunctionCost -= 2 * InlineConstants::InstrCost; 586 587 int OutliningRuntimeOverhead = 588 OutliningFuncCallCost + 589 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 590 ExtraOutliningPenalty; 591 592 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 593 } 594 595 // Create the callsite to profile count map which is 596 // used to update the original function's entry count, 597 // after the function is partially inlined into the callsite. 598 void PartialInlinerImpl::computeCallsiteToProfCountMap( 599 Function *DuplicateFunction, 600 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) { 601 std::vector<User *> Users(DuplicateFunction->user_begin(), 602 DuplicateFunction->user_end()); 603 Function *CurrentCaller = nullptr; 604 std::unique_ptr<BlockFrequencyInfo> TempBFI; 605 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 606 607 auto ComputeCurrBFI = [&,this](Function *Caller) { 608 // For the old pass manager: 609 if (!GetBFI) { 610 DominatorTree DT(*Caller); 611 LoopInfo LI(DT); 612 BranchProbabilityInfo BPI(*Caller, LI); 613 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 614 CurrentCallerBFI = TempBFI.get(); 615 } else { 616 // New pass manager: 617 CurrentCallerBFI = &(*GetBFI)(*Caller); 618 } 619 }; 620 621 for (User *User : Users) { 622 CallSite CS = getCallSite(User); 623 Function *Caller = CS.getCaller(); 624 if (CurrentCaller != Caller) { 625 CurrentCaller = Caller; 626 ComputeCurrBFI(Caller); 627 } else { 628 assert(CurrentCallerBFI && "CallerBFI is not set"); 629 } 630 BasicBlock *CallBB = CS.getInstruction()->getParent(); 631 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 632 if (Count) 633 CallSiteToProfCountMap[User] = *Count; 634 else 635 CallSiteToProfCountMap[User] = 0; 636 } 637 } 638 639 PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F, 640 FunctionOutliningInfo *OI) 641 : OrigFunc(F) { 642 ClonedOI = llvm::make_unique<FunctionOutliningInfo>(); 643 644 // Clone the function, so that we can hack away on it. 645 ValueToValueMapTy VMap; 646 ClonedFunc = CloneFunction(F, VMap); 647 648 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 649 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 650 for (BasicBlock *BB : OI->Entries) { 651 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 652 } 653 for (BasicBlock *E : OI->ReturnBlockPreds) { 654 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 655 ClonedOI->ReturnBlockPreds.push_back(NewE); 656 } 657 // Go ahead and update all uses to the duplicate, so that we can just 658 // use the inliner functionality when we're done hacking. 659 F->replaceAllUsesWith(ClonedFunc); 660 } 661 662 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { 663 664 auto getFirstPHI = [](BasicBlock *BB) { 665 BasicBlock::iterator I = BB->begin(); 666 PHINode *FirstPhi = nullptr; 667 while (I != BB->end()) { 668 PHINode *Phi = dyn_cast<PHINode>(I); 669 if (!Phi) 670 break; 671 if (!FirstPhi) { 672 FirstPhi = Phi; 673 break; 674 } 675 } 676 return FirstPhi; 677 }; 678 679 // Special hackery is needed with PHI nodes that have inputs from more than 680 // one extracted block. For simplicity, just split the PHIs into a two-level 681 // sequence of PHIs, some of which will go in the extracted region, and some 682 // of which will go outside. 683 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 684 // only split block when necessary: 685 PHINode *FirstPhi = getFirstPHI(PreReturn); 686 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 687 688 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 689 return; 690 691 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 692 Value *CommonValue = PN->getIncomingValue(0); 693 if (all_of(PN->incoming_values(), 694 [&](Value *V) { return V == CommonValue; })) 695 return CommonValue; 696 return nullptr; 697 }; 698 699 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 700 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 701 BasicBlock::iterator I = PreReturn->begin(); 702 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 703 SmallVector<Instruction *, 4> DeadPhis; 704 while (I != PreReturn->end()) { 705 PHINode *OldPhi = dyn_cast<PHINode>(I); 706 if (!OldPhi) 707 break; 708 709 PHINode *RetPhi = 710 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 711 OldPhi->replaceAllUsesWith(RetPhi); 712 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 713 714 RetPhi->addIncoming(&*I, PreReturn); 715 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 716 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 717 OldPhi->removeIncomingValue(E); 718 } 719 720 // After incoming values splitting, the old phi may become trivial. 721 // Keeping the trivial phi can introduce definition inside the outline 722 // region which is live-out, causing necessary overhead (load, store 723 // arg passing etc). 724 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 725 OldPhi->replaceAllUsesWith(OldPhiVal); 726 DeadPhis.push_back(OldPhi); 727 } 728 ++I; 729 } 730 for (auto *DP : DeadPhis) 731 DP->eraseFromParent(); 732 733 for (auto E : ClonedOI->ReturnBlockPreds) { 734 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 735 } 736 } 737 738 Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() { 739 // Returns true if the block is to be partial inlined into the caller 740 // (i.e. not to be extracted to the out of line function) 741 auto ToBeInlined = [&, this](BasicBlock *BB) { 742 return BB == ClonedOI->ReturnBlock || 743 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != 744 ClonedOI->Entries.end()); 745 }; 746 747 // Gather up the blocks that we're going to extract. 748 std::vector<BasicBlock *> ToExtract; 749 ToExtract.push_back(ClonedOI->NonReturnBlock); 750 OutlinedRegionCost += 751 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); 752 for (BasicBlock &BB : *ClonedFunc) 753 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 754 ToExtract.push_back(&BB); 755 // FIXME: the code extractor may hoist/sink more code 756 // into the outlined function which may make the outlining 757 // overhead (the difference of the outlined function cost 758 // and OutliningRegionCost) look larger. 759 OutlinedRegionCost += computeBBInlineCost(&BB); 760 } 761 762 // The CodeExtractor needs a dominator tree. 763 DominatorTree DT; 764 DT.recalculate(*ClonedFunc); 765 766 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 767 LoopInfo LI(DT); 768 BranchProbabilityInfo BPI(*ClonedFunc, LI); 769 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 770 771 // Extract the body of the if. 772 OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 773 ClonedFuncBFI.get(), &BPI) 774 .extractCodeRegion(); 775 776 if (OutlinedFunc) { 777 OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) 778 .getInstruction() 779 ->getParent(); 780 assert(OutliningCallBB->getParent() == ClonedFunc); 781 } 782 783 return OutlinedFunc; 784 } 785 786 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 787 // Ditch the duplicate, since we're done with it, and rewrite all remaining 788 // users (function pointers, etc.) back to the original function. 789 ClonedFunc->replaceAllUsesWith(OrigFunc); 790 ClonedFunc->eraseFromParent(); 791 if (!IsFunctionInlined) { 792 // Remove the function that is speculatively created if there is no 793 // reference. 794 if (OutlinedFunc) 795 OutlinedFunc->eraseFromParent(); 796 } 797 } 798 799 Function *PartialInlinerImpl::unswitchFunction(Function *F) { 800 801 if (F->hasAddressTaken()) 802 return nullptr; 803 804 // Let inliner handle it 805 if (F->hasFnAttribute(Attribute::AlwaysInline)) 806 return nullptr; 807 808 if (F->hasFnAttribute(Attribute::NoInline)) 809 return nullptr; 810 811 if (PSI->isFunctionEntryCold(F)) 812 return nullptr; 813 814 if (F->user_begin() == F->user_end()) 815 return nullptr; 816 817 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 818 819 if (!OI) 820 return nullptr; 821 822 FunctionCloner Cloner(F, OI.get()); 823 Cloner.NormalizeReturnBlock(); 824 Function *OutlinedFunction = Cloner.doFunctionOutlining(); 825 826 bool AnyInline = tryPartialInline(Cloner); 827 828 if (AnyInline) 829 return OutlinedFunction; 830 831 return nullptr; 832 } 833 834 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 835 int NonWeightedRcost; 836 int SizeCost; 837 838 if (Cloner.OutlinedFunc == nullptr) 839 return false; 840 841 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); 842 843 auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 844 auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 845 846 // The call sequence to the outlined function is larger than the original 847 // outlined region size, it does not increase the chances of inlining 848 // the function with outlining (The inliner uses the size increase to 849 // model the cost of inlining a callee). 850 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 851 OptimizationRemarkEmitter ORE(Cloner.OrigFunc); 852 DebugLoc DLoc; 853 BasicBlock *Block; 854 std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc); 855 ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 856 DLoc, Block) 857 << ore::NV("Function", Cloner.OrigFunc) 858 << " not partially inlined into callers (Original Size = " 859 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 860 << ", Size of call sequence to outlined function = " 861 << ore::NV("NewSize", SizeCost) << ")"); 862 return false; 863 } 864 865 assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() && 866 "F's users should all be replaced!"); 867 868 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 869 Cloner.ClonedFunc->user_end()); 870 871 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 872 if (Cloner.OrigFunc->getEntryCount()) 873 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 874 875 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 876 uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0); 877 878 bool AnyInline = false; 879 for (User *User : Users) { 880 CallSite CS = getCallSite(User); 881 882 if (IsLimitReached()) 883 continue; 884 885 OptimizationRemarkEmitter ORE(CS.getCaller()); 886 887 if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE)) 888 continue; 889 890 ORE.emit( 891 OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()) 892 << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 893 << ore::NV("Caller", CS.getCaller())); 894 895 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); 896 InlineFunction(CS, IFI); 897 898 // Now update the entry count: 899 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 900 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 901 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 902 } 903 904 AnyInline = true; 905 NumPartialInlining++; 906 // Update the stats 907 NumPartialInlined++; 908 } 909 910 if (AnyInline) { 911 Cloner.IsFunctionInlined = true; 912 if (CalleeEntryCount) 913 Cloner.OrigFunc->setEntryCount(CalleeEntryCountV); 914 } 915 916 return AnyInline; 917 } 918 919 bool PartialInlinerImpl::run(Module &M) { 920 if (DisablePartialInlining) 921 return false; 922 923 std::vector<Function *> Worklist; 924 Worklist.reserve(M.size()); 925 for (Function &F : M) 926 if (!F.use_empty() && !F.isDeclaration()) 927 Worklist.push_back(&F); 928 929 bool Changed = false; 930 while (!Worklist.empty()) { 931 Function *CurrFunc = Worklist.back(); 932 Worklist.pop_back(); 933 934 if (CurrFunc->use_empty()) 935 continue; 936 937 bool Recursive = false; 938 for (User *U : CurrFunc->users()) 939 if (Instruction *I = dyn_cast<Instruction>(U)) 940 if (I->getParent()->getParent() == CurrFunc) { 941 Recursive = true; 942 break; 943 } 944 if (Recursive) 945 continue; 946 947 if (Function *NewFunc = unswitchFunction(CurrFunc)) { 948 Worklist.push_back(NewFunc); 949 Changed = true; 950 } 951 } 952 953 return Changed; 954 } 955 956 char PartialInlinerLegacyPass::ID = 0; 957 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 958 "Partial Inliner", false, false) 959 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 960 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 961 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 962 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 963 "Partial Inliner", false, false) 964 965 ModulePass *llvm::createPartialInliningPass() { 966 return new PartialInlinerLegacyPass(); 967 } 968 969 PreservedAnalyses PartialInlinerPass::run(Module &M, 970 ModuleAnalysisManager &AM) { 971 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 972 973 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 974 [&FAM](Function &F) -> AssumptionCache & { 975 return FAM.getResult<AssumptionAnalysis>(F); 976 }; 977 978 std::function<BlockFrequencyInfo &(Function &)> GetBFI = 979 [&FAM](Function &F) -> BlockFrequencyInfo & { 980 return FAM.getResult<BlockFrequencyAnalysis>(F); 981 }; 982 983 std::function<TargetTransformInfo &(Function &)> GetTTI = 984 [&FAM](Function &F) -> TargetTransformInfo & { 985 return FAM.getResult<TargetIRAnalysis>(F); 986 }; 987 988 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); 989 990 if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M)) 991 return PreservedAnalyses::none(); 992 return PreservedAnalyses::all(); 993 } 994