//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max Number of Blocks To be Partially Inlined"));

// Command line option to set the maximum number of partial inlinings allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces a larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo()
      : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
        ReturnBlockPreds() {}
  // Returns the number of blocks to be inlined, including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks, including the function entry, that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;
  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock;
  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock;
  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
  bool run(Module &M);
  Function *unswitchFunction(Function *F);

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
                                                   FunctionOutliningInfo *OI,
                                                   Function *DuplicateFunction,
                                                   BlockFrequencyInfo *BFI,
                                                   BasicBlock *OutliningCallBB);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
                           BlockFrequencyInfo *CalleeBFI,
                           BasicBlock *OutliningCallBB,
                           int OutliningCallOverhead,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(Function *DuplicateFunction,
                        Function *F, /* original function */
                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
                        BlockFrequencyInfo *CalleeBFI);

  // Compute the mapping from each use site of DuplicateFunction to the
  // enclosing BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function 'OutlinedFunction', including the additional
  //   setup cost in the outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block 'OutliningCallBB';
  // - The third value is the estimated size of the original code from
  //   function 'F' that is extracted into the outlined function.
  std::tuple<int, int, int>
  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
                        Function *OutlinedFunction,
                        BasicBlock *OutliningCallBB);
  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);

};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }
  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
  }
};
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;

  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do a sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now grow the candidate's inlining region further by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
    Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction,
    BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) {

  auto EntryFreq =
      BFI->getBlockFreq(&DuplicateFunction->getEntryBlock());
  auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  if (hasProfileData(F, OI))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be very
  // conservative in estimating the overall savings. We need to make sure
  // the outline region relative frequency is not below the threshold
  // specified by the option.
  OutlineRegionRelFreq =
      std::max(OutlineRegionRelFreq,
               BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
  using namespace ore;
  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI);

  if (IC.isAlways()) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", F)
             << " should always be fully inlined, not partially");
    return false;
  }

  if (IC.isNever()) {
    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", F) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
    return false;
  }

  if (!IC) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", F) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();
  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  auto RelativeFreq =
      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
  auto NormWeightedRcost =
      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;

  // If the weighted saving is smaller than the weighted cost, return false:
  if (NormWeightedSavings < NormWeightedRcost) {
    ORE.emit(
        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
        << NV("Callee", F) << " not partially inlined into "
        << NV("Caller", Caller) << " runtime overhead (overhead="
        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
        << ", savings="
        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
        << " of making the outlined call is too high");

    return false;
  }

  ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", F) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
    BasicBlock *OutliningCallBB) {
  // First compute the cost of the outlined region 'OI' in the original
  // function 'F':
  int OutlinedRegionCost = 0;
  for (BasicBlock &BB : *F) {
    if (&BB != OI->ReturnBlock &&
        // Assuming Entry set is small -- do a linear search here:
        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
            OI->Entries.end()) {
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }
  }

  // Now compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *OutlinedFunction) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  int OutliningRuntimeOverhead =
      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
                         OutlinedRegionCost);
}

// Create the callsite-to-profile-count map, which is used to update the
// original function's entry count after the function is partially inlined
// into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      if (CurrentCallerBFI)
        delete CurrentCallerBFI;
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
  if (!GetBFI) {
    if (CurrentCallerBFI)
      delete CurrentCallerBFI;
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {

  if (F->hasAddressTaken())
    return nullptr;

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  Function *DuplicateFunction = CloneFunction(F, VMap);
  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  DenseSet<BasicBlock *> NewEntries;
  for (BasicBlock *BB : OI->Entries) {
    NewEntries.insert(cast<BasicBlock>(VMap[BB]));
  }

  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(DuplicateFunction);

  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };
  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = NewReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();
  if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {

    NewReturnBlock = NewReturnBlock->splitBasicBlock(
        NewReturnBlock->getFirstNonPHI()->getIterator());
    BasicBlock::iterator I = PreReturn->begin();
    Instruction *Ins = &NewReturnBlock->front();
    while (I != PreReturn->end()) {
      PHINode *OldPhi = dyn_cast<PHINode>(I);
      if (!OldPhi)
        break;

      PHINode *RetPhi =
          PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
      OldPhi->replaceAllUsesWith(RetPhi);
      Ins = NewReturnBlock->getFirstNonPHI();

      RetPhi->addIncoming(&*I, PreReturn);
      for (BasicBlock *E : OI->ReturnBlockPreds) {
        BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
        RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
        OldPhi->removeIncomingValue(NewE);
      }
      ++I;
    }
    for (auto E : OI->ReturnBlockPreds) {
      BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
      NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
    }
  }

  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted to the out-of-line function).
  auto ToBeInlined = [&](BasicBlock *BB) {
    return BB == NewReturnBlock || NewEntries.count(BB);
  };
  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(NewNonReturnBlock);
  for (BasicBlock &BB : *DuplicateFunction)
    if (!ToBeInlined(&BB) && &BB != NewNonReturnBlock)
      ToExtract.push_back(&BB);

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*DuplicateFunction);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*DuplicateFunction, LI);
  BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);

  // Extract the body of the if.
  Function *OutlinedFunction =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
          .extractCodeRegion();

  bool AnyInline =
      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);

  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  DuplicateFunction->replaceAllUsesWith(F);
  DuplicateFunction->eraseFromParent();

  if (AnyInline)
    return OutlinedFunction;

  // Remove the function that is speculatively created:
  if (OutlinedFunction)
    OutlinedFunction->eraseFromParent();

  return nullptr;
}

bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
                                          Function *F,
                                          FunctionOutliningInfo *OI,
                                          Function *OutlinedFunction,
                                          BlockFrequencyInfo *CalleeBFI) {
  if (OutlinedFunction == nullptr)
    return false;

  int NonWeightedRcost;
  int SizeCost;
  int OutlinedRegionSizeCost;

  auto OutliningCallBB =
      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();

  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);

  // If the call sequence to the outlined function is larger than the original
  // outlined region size, outlining does not increase the chances of inlining
  // 'F' (the inliner uses the size increase to model the cost of inlining a
  // callee).
  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
    OptimizationRemarkEmitter ORE(F);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", F)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")");
    return false;
  }

  assert(F->user_begin() == F->user_end() &&
         "F's users should all be replaced!");
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (F->getEntryCount())
    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);

  auto CalleeEntryCount = F->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
                             NonWeightedRcost, ORE))
      continue;

    ORE.emit(
        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
        << ore::NV("Callee", F) << " partially inlined into "
        << ore::NV("Caller", CS.getCaller()));

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    InlineFunction(CS, IFI);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    NumPartialInlined++;
  }

  if (AnyInline && CalleeEntryCount)
    F->setEntryCount(CalleeEntryCountV);

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}