//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of call sites functions are partially inlined into.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max Number of Blocks To be Partially Inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is the
// lower bound of the relative frequency of the outline region; if BFI
// produces a larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

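// An illustrative sketch of the transformation (C-like pseudocode; the names
// are made up, not part of the source):
//
//   void foo() {              // before
//     if (cheap_test)         //   guarding entry block(s)
//       return;               //   ReturnBlock
//     ... heavy code ...      //   NonReturnBlock and below: outline region
//   }
//
// The guard and the return path stay in (and get inlined from) 'foo', while
// the heavy region is extracted into a new function that 'foo' calls:
//
//   void foo() {
//     if (cheap_test)
//       return;
//     foo_outlined();         // cold call to the extracted region
//   }
//
// FunctionOutliningInfo below records the blocks that make up this split.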
struct FunctionOutliningInfo {
  FunctionOutliningInfo()
      : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
        ReturnBlockPreds() {}

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;
  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock;
  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock;
  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);
  Function *unswitchFunction(Function *F);

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningBB relative to F's entry point.
  // The result is no larger than 1 and is represented using
  // BranchProbability. (Note that the outlined region's 'head' block can
  // only have incoming edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
                                                   FunctionOutliningInfo *OI,
                                                   Function *DuplicateFunction,
                                                   BlockFrequencyInfo *BFI,
                                                   BasicBlock *OutliningCallBB);
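  // For example (illustrative numbers): with an entry block frequency of 16
  // and an outlining-call block frequency of 12, the relative frequency is
  // the branch probability 12/16, i.e. 75%.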
  // Return true if partially inlining the callee of CS is expected to be
  // profitable.
  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
                           BlockFrequencyInfo *CalleeBFI,
                           BasicBlock *OutliningCallBB,
                           int OutliningCallOverhead,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(Function *DuplicateFunction,
                        Function *F, /* original function */
                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
                        BlockFrequencyInfo *CalleeBFI);

  // Compute the mapping from use sites of DuplicateFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  CallSite getOneCallSiteTo(Function *F) {
    User *U = *F->user_begin();
    return getCallSite(U);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in
  //   basic block 'OutliningCallBB';
  // - The second value is the non-weighted runtime overhead of making the
  //   call to the outlined function 'OutlinedFunction', including the
  //   additional setup cost incurred in the outlined function itself;
  // - The third value is the estimated size of the original code from
  //   function 'F' that is extracted into the outlined function.
  std::tuple<int, int, int>
  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
                        Function *OutlinedFunction,
                        BasicBlock *OutliningCallBB);
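  // Worked example with illustrative numbers: if the call sequence in
  // OutliningCallBB costs 25, the outlined function (after the extractor
  // stub-branch adjustment) costs 130, and the extracted region originally
  // cost 100, the result is (25, 25 + (130 - 100) + ExtraOutliningPenalty,
  // 100), i.e. (25, 55, 100) with the default penalty of 0.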
  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
  }
};
} // end anonymous namespace

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
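  // Walk the chain of guarding blocks starting at the entry block. Each
  // guard must end in a conditional branch. If one successor is a return
  // block, the candidate is found. Otherwise, if one successor is also a
  // successor of the other (a triangle), the current block is recorded as
  // a guard and the scan continues at the other successor.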
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do a sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();
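  // The growing step below peels additional guards off the outline region:
  // if NonReturnBlock ('Cand') also ends in a conditional branch whose one
  // successor is the same ReturnBlock and whose other successor is reached
  // only from 'Cand', then 'Cand' is moved into the inlined entry set and
  // the outline region shrinks to start at that other successor.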
  // Now further grow the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t TrueWeight, FalseWeight;
    if (BR->extractProfMetadata(TrueWeight, FalseWeight))
      return true;
  }
  return false;
}

BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {

  auto EntryFreq =
      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  if (hasProfileData(F, OI))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. When the outlined
  // region is predicted to be likely, its probability needs to be made
  // higher (more biased) to not under-estimate the cost of function
  // outlining. On the other hand, if the outlined region is predicted to
  // be less likely, the predicted probability is usually higher than the
  // actual one. For instance, the actual probability of the less likely
  // target may be only 5%, while the guessed probability can be 40%. In
  // that case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
  using namespace ore;
  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI);

  if (IC.isAlways()) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", F)
             << " should always be fully inlined, not partially");
    return false;
  }

  if (IC.isNever()) {
    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", F) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
    return false;
  }

  if (!IC) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", F) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
    return false;
  }

  const DataLayout &DL = Caller->getParent()->getDataLayout();
  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);
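  // Note the asymmetry below: the savings from removing the original call
  // apply each time the (partially inlined) entry is executed, so they are
  // left at full weight, while the outlining overhead is only paid when the
  // outlined region is actually entered and is therefore scaled by the
  // region's relative frequency.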
  auto RelativeFreq =
      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
  auto NormWeightedRcost =
      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;

  // If the weighted savings are smaller than the weighted cost, do not
  // partially inline:
  if (NormWeightedSavings < NormWeightedRcost) {
    ORE.emit(
        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
        << NV("Callee", F) << " not partially inlined into "
        << NV("Caller", Caller) << " runtime overhead (overhead="
        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
        << ", savings="
        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
        << " of making the outlined call is too high");

    return false;
  }

  ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", F) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
      // fall through
    default:
      break;
    }

    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
    BasicBlock *OutliningCallBB) {
  // First compute the cost of the outlined region 'OI' in the original
  // function 'F'.
  // FIXME: The code extractor (outliner) can now do code sinking/hoisting
  // to reduce outlining cost. The hoisted/sunk code currently does not
  // incur any runtime cost, so it is still OK to compare the outlined
  // function cost with the outlined region in the original function.
  // If this ever changes, we will need to introduce a new extractor API
  // to pass the information.
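  // In summary, the computation below produces (mirroring the return value):
  //   OutliningRuntimeOverhead = OutliningFuncCallCost
  //       + (OutlinedFunctionCost - OutlinedRegionCost) // extraction overhead
  //       + ExtraOutliningPenalty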
  int OutlinedRegionCost = 0;
  for (BasicBlock &BB : *F) {
    if (&BB != OI->ReturnBlock &&
        // Assuming Entry set is small -- do a linear search here:
        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
            OI->Entries.end()) {
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }
  }

  // Now compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *OutlinedFunction) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;

  int OutliningRuntimeOverhead = OutliningFuncCallCost +
                                 (OutlinedFunctionCost - OutlinedRegionCost) +
                                 ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
                         OutlinedRegionCost);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *U : Users) {
    CallSite CS = getCallSite(U);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[U] = *Count;
    else
      CallSiteToProfCountMap[U] = 0;
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {

  if (F->hasAddressTaken())
    return nullptr;

  // Let the inliner handle it:
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;
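  // The overall plan from here: clone F, redirect all users of F to the
  // clone, split PHI nodes in the clone's return block if needed, extract
  // the outline region from the clone into a new function, and finally try
  // to inline the now-small clone into each call site.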
  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  Function *DuplicateFunction = CloneFunction(F, VMap);
  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  DenseSet<BasicBlock *> NewEntries;
  for (BasicBlock *BB : OI->Entries) {
    NewEntries.insert(cast<BasicBlock>(VMap[BB]));
  }

  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(DuplicateFunction);

  // Return the first PHI node of BB, or null if BB does not start with one.
  auto GetFirstPHI = [](BasicBlock *BB) -> PHINode * {
    return dyn_cast<PHINode>(&BB->front());
  };

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = NewReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = GetFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {

    NewReturnBlock = NewReturnBlock->splitBasicBlock(
        NewReturnBlock->getFirstNonPHI()->getIterator());
    BasicBlock::iterator I = PreReturn->begin();
    Instruction *Ins = &NewReturnBlock->front();
    SmallVector<Instruction *, 4> DeadPhis;
    while (I != PreReturn->end()) {
      PHINode *OldPhi = dyn_cast<PHINode>(I);
      if (!OldPhi)
        break;

      PHINode *RetPhi =
          PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
      OldPhi->replaceAllUsesWith(RetPhi);
      Ins = NewReturnBlock->getFirstNonPHI();

      RetPhi->addIncoming(&*I, PreReturn);
      for (BasicBlock *E : OI->ReturnBlockPreds) {
        BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
        RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
        OldPhi->removeIncomingValue(NewE);
      }

      // After splitting the incoming values, the old phi may become trivial.
      // Keeping the trivial phi can introduce a definition inside the outline
      // region which is live-out, causing unnecessary overhead (load, store,
      // arg passing, etc.).
      if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
        OldPhi->replaceAllUsesWith(OldPhiVal);
        DeadPhis.push_back(OldPhi);
      }

      ++I;
    }

    for (auto *DP : DeadPhis)
      DP->eraseFromParent();

    for (auto E : OI->ReturnBlockPreds) {
      BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
      NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
    }
  }

  // Returns true if the block is to be partially inlined into the caller
  // (i.e., not to be extracted into the out-of-line function).
  auto ToBeInlined = [&](BasicBlock *BB) {
    return BB == NewReturnBlock || NewEntries.count(BB);
  };
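  // Everything that is neither a guarding entry block nor the (possibly
  // split) return block gets extracted. NewNonReturnBlock is pushed first
  // so that it becomes the head of the extracted region.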
  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(NewNonReturnBlock);
  for (BasicBlock &BB : *DuplicateFunction)
    if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock)
      ToExtract.push_back(&BB);

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*DuplicateFunction);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*DuplicateFunction, LI);
  BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);

  // Extract the body of the if.
  Function *OutlinedFunction =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
          .extractCodeRegion();

  bool AnyInline =
      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);

  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  DuplicateFunction->replaceAllUsesWith(F);
  DuplicateFunction->eraseFromParent();

  if (AnyInline)
    return OutlinedFunction;

  // Remove the function that was speculatively created:
  if (OutlinedFunction)
    OutlinedFunction->eraseFromParent();

  return nullptr;
}

bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
                                          Function *F,
                                          FunctionOutliningInfo *OI,
                                          Function *OutlinedFunction,
                                          BlockFrequencyInfo *CalleeBFI) {
  if (OutlinedFunction == nullptr)
    return false;

  int NonWeightedRcost;
  int SizeCost;
  int OutlinedRegionSizeCost;

  auto OutliningCallBB =
      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();

  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);

  // If the size of the call sequence to the outlined function is larger than
  // the size of the original outlined region, outlining does not increase the
  // chances of inlining 'F' (the inliner uses the size increase to model the
  // cost of inlining a callee).
  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
    OptimizationRemarkEmitter ORE(F);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", F)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")");
    return false;
  }

  assert(F->user_begin() == F->user_end() &&
         "F's users should all be replaced!");
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (F->getEntryCount())
    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
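  // As call sites are inlined below, each inlined site's profile count is
  // subtracted from the callee's entry count, keeping the count that remains
  // on 'F' consistent with the call sites that were not transformed.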
  auto CalleeEntryCount = F->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
  bool AnyInline = false;
  for (User *U : Users) {
    CallSite CS = getCallSite(U);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
                             NonWeightedRcost, ORE))
      continue;

    ORE.emit(
        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
        << ore::NV("Callee", F) << " partially inlined into "
        << ore::NV("Caller", CS.getCaller()));

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    InlineFunction(CS, IFI);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(U)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[U];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats:
    NumPartialInlined++;
  }

  if (AnyInline && CalleeEntryCount)
    F->setEntryCount(CalleeEntryCountV);

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}
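// Usage note (not part of the original source; based on the registration
// names above): the pass can be exercised in isolation with, for example,
//   opt -partial-inliner -S input.ll          # legacy pass manager
//   opt -passes=partial-inliner -S input.ll   # new pass manager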