1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass performs partial inlining, typically by inlining an if statement 11 // that surrounds the body of the function. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/PartialInlining.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/ADT/DenseSet.h" 18 #include "llvm/ADT/None.h" 19 #include "llvm/ADT/Optional.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/Analysis/BlockFrequencyInfo.h" 24 #include "llvm/Analysis/BranchProbabilityInfo.h" 25 #include "llvm/Analysis/DominanceFrontier.h" 26 #include "llvm/Analysis/InlineCost.h" 27 #include "llvm/Analysis/LoopInfo.h" 28 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 29 #include "llvm/Analysis/ProfileSummaryInfo.h" 30 #include "llvm/Analysis/TargetLibraryInfo.h" 31 #include "llvm/Analysis/TargetTransformInfo.h" 32 #include "llvm/IR/Attributes.h" 33 #include "llvm/IR/BasicBlock.h" 34 #include "llvm/IR/CFG.h" 35 #include "llvm/IR/CallSite.h" 36 #include "llvm/IR/DebugLoc.h" 37 #include "llvm/IR/DiagnosticInfo.h" 38 #include "llvm/IR/Dominators.h" 39 #include "llvm/IR/Function.h" 40 #include "llvm/IR/InstrTypes.h" 41 #include "llvm/IR/Instruction.h" 42 #include "llvm/IR/Instructions.h" 43 #include "llvm/IR/IntrinsicInst.h" 44 #include "llvm/IR/Intrinsics.h" 45 #include "llvm/IR/Module.h" 46 #include "llvm/IR/User.h" 47 #include "llvm/Pass.h" 48 #include "llvm/Support/BlockFrequency.h" 49 #include "llvm/Support/BranchProbability.h" 50 #include "llvm/Support/Casting.h" 51 #include "llvm/Support/CommandLine.h" 52 
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region (cold-region) partial-inlining.
// The default is false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing: it bypasses the profitability check
// so tests can exercise the transformation unconditionally.
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// i.e. the cold region must reduce the inlining cost of the original function
// by at least 10% to be outlined.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. i.e. a confidence interval: edges out of rarely
// executed blocks carry too little signal to be trusted.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%.
// i.e. if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

// Describes a single-region outlining candidate: a chain of guarding entry
// blocks that will stay inlined, plus the return block and the dominating
// block of the region that will be extracted.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

// Describes the set of cold single-entry/single-exit regions selected for
// multi-region outlining.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo()
      : ORI() {}

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI,
      std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI),
        GetORE(GORE) {}

  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;
  std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;

  // Return the frequency of the OutliningBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // True once the per-module budget (-max-partial-inlining) is exhausted.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Wrap a user of a function into a CallSite; aborts on non-call users.
  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  // Returns the first callsite of F. Assumes F has at least one user --
  // callers must guarantee that before invoking this.
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  // Debug-location and enclosing block of one (arbitrary) callsite of F,
  // used to anchor optimization remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    std::unique_ptr<OptimizationRemarkEmitter> UPORE;

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    // NOTE(review): each call replaces the single cached emitter, so the
    // reference returned by a previous call dangles after the next one --
    // callers appear to use the emitter immediately; confirm.
    std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
        [&UPORE](Function &F) -> OptimizationRemarkEmitter & {
      UPORE.reset(new OptimizationRemarkEmitter(&F));
      return *UPORE.get();
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI,
                              &GetORE)
        .run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
  BasicBlock *EntryBlock =
      &F->front();

  DominatorTree DT(*F);
  DominanceFrontier DF;
  DF.analyze(DT);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    // No BFI provider from the pass manager: build a function-local BFI.
    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(*GetBFI)(*F);

  auto &ORE = (*GetORE)(*F);

  // Return if we don't have profiling information.
  if (!PSI->hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // A candidate region (given as the dominator-descendant list of its head)
  // is single-entry iff its head has exactly one predecessor.
  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
    BasicBlock *Dom = BlockList.front();
    return BlockList.size() > 1 &&
           std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
  };

  // Returns the unique block with an exit edge leaving the region, or
  // nullptr (with a missed-optimization remark) if there is more than one.
  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
        if (!is_contained(BlockList, *SI)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &SI->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          } else
            ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  // Profile count of BB, treating "no count" as zero.
  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB)
               ? BFI->getBlockProfileCount(BB).getValue()
               : 0;
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // outlining the candidate region.
  int OverallFunctionCost = 0;
  for (auto &BB : *F)
    OverallFunctionCost += computeBBInlineCost(&BB);

#ifndef NDEBUG
  if (TracePartialInlining)
    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
#endif
  int MinOutlineRegionCost =
      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  DenseMap<BasicBlock *, bool> VisitedMap;
  DFS.push_back(CurrEntry);
  VisitedMap[CurrEntry] = true;
  // Use Depth First Search on the basic blocks to find CFG edges that are
  // considered cold.
  // Cold regions considered must also have its inline cost compared to the
  // overall inline cost of the original function. The region is outlined only
  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *thisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI->isColdBB(thisBB, BFI) ||
        BBProfileCount(thisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
      if (VisitedMap[*SI])
        continue;
      VisitedMap[*SI] = true;
      DFS.push_back(*SI);
      // If branch isn't cold, we skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
               << "\n";
      }
#endif
      // The candidate region is everything dominated by the cold edge target.
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      // We can only outline single entry regions (for now).
      if (!IsSingleEntry(DominateVector))
        continue;
      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector)))
        continue;
      int OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB);

#ifndef NDEBUG
      if (TracePartialInlining)
        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
#endif

      if (OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });
        continue;
      }
      // For now, ignore blocks that belong to a SISE region that is a
      // candidate for outlining. In the future, we may want to look
      // at inner regions because the outer region may have live-exit
      // variables.
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call
      // NOTE(review): getSingleSuccessor() can return null; the disabled
      // assert below suggests that case occurs in practice, so consumers of
      // OutlineRegionInfo must tolerate a null ReturnBlock -- confirm.
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      RegInfo.Region = DominateVector;
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }
  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair so the return block (if any) comes first; returns
  // {nullptr, nullptr} when neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if
        (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  // Walk down a chain of two-way guard blocks from the entry, collecting them
  // into Entries until a successor is a return block (candidate found) or the
  // shape no longer matches.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  // (Entries is captured by value here; a copy of the set is made.)
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the entry chain is closed (no stray edges in or out) and records
  // which entries branch directly to the return block.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    // Note: T/F here shadow the parameter F; they receive the taken/not-taken
    // branch weights from the !prof metadata.
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost) {
  using namespace ore;

  // Testing hook: bypass all profitability checks.
  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  auto &ORE = (*GetORE)(*Caller);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  // An always-inline callee should be fully inlined by the regular inliner
  // instead; partial inlining would be strictly worse.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  // All checks passed: partial inlining is expected to be profitable.
  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
853 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { 854 int InlineCost = 0; 855 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 856 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 857 if (isa<DbgInfoIntrinsic>(I)) 858 continue; 859 860 switch (I->getOpcode()) { 861 case Instruction::BitCast: 862 case Instruction::PtrToInt: 863 case Instruction::IntToPtr: 864 case Instruction::Alloca: 865 continue; 866 case Instruction::GetElementPtr: 867 if (cast<GetElementPtrInst>(I)->hasAllZeroIndices()) 868 continue; 869 default: 870 break; 871 } 872 873 IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I); 874 if (IntrInst) { 875 if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || 876 IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) 877 continue; 878 } 879 880 if (CallInst *CI = dyn_cast<CallInst>(I)) { 881 InlineCost += getCallsiteCost(CallSite(CI), DL); 882 continue; 883 } 884 885 if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { 886 InlineCost += getCallsiteCost(CallSite(II), DL); 887 continue; 888 } 889 890 if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { 891 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 892 continue; 893 } 894 InlineCost += InlineConstants::InstrCost; 895 } 896 return InlineCost; 897 } 898 899 std::tuple<int, int> 900 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { 901 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 902 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 903 Function *OutlinedFunc = FuncBBPair.first; 904 BasicBlock* OutliningCallBB = FuncBBPair.second; 905 // Now compute the cost of the call sequence to the outlined function 906 // 'OutlinedFunction' in BB 'OutliningCallBB': 907 OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); 908 909 // Now compute the cost of the extracted/outlined function itself: 910 for (BasicBlock &BB : *OutlinedFunc) 911 OutlinedFunctionCost += computeBBInlineCost(&BB); 912 } 913 
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();

  // Runtime overhead = call-sequence cost + code growth of the outlined
  // function(s) over the original region(s) + a fixed outlining penalty.
  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  // Take a snapshot of the users up front.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  // (Re)compute BFI for the current caller. Without a BFI getter (legacy
  // pass manager) a temporary BFI is built from scratch.
  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    // Only recompute BFI when the caller changes between consecutive users.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    // Call sites without a profile count are recorded as zero.
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

// Single-region cloner: clone \p F and remap the outlining info \p OI
// (return/non-return blocks, entries, return-block predecessors) into the
// clone through the value map.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Multi-region cloner: clone \p F and remap every cold outline-candidate
// region in \p OI into the clone.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
1009 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1010 OI->ORI) { 1011 SmallVector<BasicBlock *, 8> Region; 1012 for (BasicBlock *BB : RegionInfo.Region) { 1013 Region.push_back(cast<BasicBlock>(VMap[BB])); 1014 } 1015 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1016 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1017 BasicBlock *NewReturnBlock = nullptr; 1018 if (RegionInfo.ReturnBlock) 1019 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1020 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1021 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1022 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1023 } 1024 // Go ahead and update all uses to the duplicate, so that we can just 1025 // use the inliner functionality when we're done hacking. 1026 F->replaceAllUsesWith(ClonedFunc); 1027 } 1028 1029 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { 1030 auto getFirstPHI = [](BasicBlock *BB) { 1031 BasicBlock::iterator I = BB->begin(); 1032 PHINode *FirstPhi = nullptr; 1033 while (I != BB->end()) { 1034 PHINode *Phi = dyn_cast<PHINode>(I); 1035 if (!Phi) 1036 break; 1037 if (!FirstPhi) { 1038 FirstPhi = Phi; 1039 break; 1040 } 1041 } 1042 return FirstPhi; 1043 }; 1044 1045 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1046 // blocks. 1047 if (!ClonedOI) 1048 return; 1049 1050 // Special hackery is needed with PHI nodes that have inputs from more than 1051 // one extracted block. For simplicity, just split the PHIs into a two-level 1052 // sequence of PHIs, some of which will go in the extracted region, and some 1053 // of which will go outside. 
1054 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1055 // only split block when necessary: 1056 PHINode *FirstPhi = getFirstPHI(PreReturn); 1057 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1058 1059 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1060 return; 1061 1062 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1063 Value *CommonValue = PN->getIncomingValue(0); 1064 if (all_of(PN->incoming_values(), 1065 [&](Value *V) { return V == CommonValue; })) 1066 return CommonValue; 1067 return nullptr; 1068 }; 1069 1070 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1071 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1072 BasicBlock::iterator I = PreReturn->begin(); 1073 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1074 SmallVector<Instruction *, 4> DeadPhis; 1075 while (I != PreReturn->end()) { 1076 PHINode *OldPhi = dyn_cast<PHINode>(I); 1077 if (!OldPhi) 1078 break; 1079 1080 PHINode *RetPhi = 1081 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1082 OldPhi->replaceAllUsesWith(RetPhi); 1083 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1084 1085 RetPhi->addIncoming(&*I, PreReturn); 1086 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1087 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1088 OldPhi->removeIncomingValue(E); 1089 } 1090 1091 // After incoming values splitting, the old phi may become trivial. 1092 // Keeping the trivial phi can introduce definition inside the outline 1093 // region which is live-out, causing necessary overhead (load, store 1094 // arg passing etc). 
1095 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1096 OldPhi->replaceAllUsesWith(OldPhiVal); 1097 DeadPhis.push_back(OldPhi); 1098 } 1099 ++I; 1100 } 1101 for (auto *DP : DeadPhis) 1102 DP->eraseFromParent(); 1103 1104 for (auto E : ClonedOI->ReturnBlockPreds) { 1105 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1106 } 1107 } 1108 1109 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1110 1111 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) { 1112 int Cost = 0; 1113 for (BasicBlock* BB : Region) 1114 Cost += computeBBInlineCost(BB); 1115 return Cost; 1116 }; 1117 1118 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1119 1120 if (ClonedOMRI->ORI.empty()) 1121 return false; 1122 1123 // The CodeExtractor needs a dominator tree. 1124 DominatorTree DT; 1125 DT.recalculate(*ClonedFunc); 1126 1127 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1128 LoopInfo LI(DT); 1129 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1130 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1131 1132 SetVector<Value *> Inputs, Outputs, Sinks; 1133 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1134 ClonedOMRI->ORI) { 1135 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); 1136 1137 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1138 ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false); 1139 1140 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1141 1142 #ifndef NDEBUG 1143 if (TracePartialInlining) { 1144 dbgs() << "inputs: " << Inputs.size() << "\n"; 1145 dbgs() << "outputs: " << Outputs.size() << "\n"; 1146 for (Value *value : Inputs) 1147 dbgs() << "value used in func: " << *value << "\n"; 1148 for (Value *output : Outputs) 1149 dbgs() << "instr used in func: " << *output << "\n"; 1150 } 1151 #endif 1152 // Do not extract regions that have live exit variables. 
1153 if (Outputs.size() > 0 && !ForceLiveExit) 1154 continue; 1155 1156 Function *OutlinedFunc = CE.extractCodeRegion(); 1157 1158 if (OutlinedFunc) { 1159 CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); 1160 BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent(); 1161 assert(OutliningCallBB->getParent() == ClonedFunc); 1162 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1163 NumColdRegionsOutlined++; 1164 OutlinedRegionCost += CurrentOutlinedRegionCost; 1165 1166 if (MarkOutlinedColdCC) { 1167 OutlinedFunc->setCallingConv(CallingConv::Cold); 1168 OCS.setCallingConv(CallingConv::Cold); 1169 } 1170 } else 1171 ORE.emit([&]() { 1172 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1173 &RegionInfo.Region.front()->front()) 1174 << "Failed to extract region at block " 1175 << ore::NV("Block", RegionInfo.Region.front()); 1176 }); 1177 } 1178 1179 return !OutlinedFunctions.empty(); 1180 } 1181 1182 Function * 1183 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1184 // Returns true if the block is to be partial inlined into the caller 1185 // (i.e. not to be extracted to the out of line function) 1186 auto ToBeInlined = [&, this](BasicBlock *BB) { 1187 return BB == ClonedOI->ReturnBlock || 1188 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != 1189 ClonedOI->Entries.end()); 1190 }; 1191 1192 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1193 // The CodeExtractor needs a dominator tree. 1194 DominatorTree DT; 1195 DT.recalculate(*ClonedFunc); 1196 1197 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1198 LoopInfo LI(DT); 1199 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1200 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1201 1202 // Gather up the blocks that we're going to extract. 
1203 std::vector<BasicBlock *> ToExtract; 1204 ToExtract.push_back(ClonedOI->NonReturnBlock); 1205 OutlinedRegionCost += 1206 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); 1207 for (BasicBlock &BB : *ClonedFunc) 1208 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1209 ToExtract.push_back(&BB); 1210 // FIXME: the code extractor may hoist/sink more code 1211 // into the outlined function which may make the outlining 1212 // overhead (the difference of the outlined function cost 1213 // and OutliningRegionCost) look larger. 1214 OutlinedRegionCost += computeBBInlineCost(&BB); 1215 } 1216 1217 // Extract the body of the if. 1218 Function *OutlinedFunc = 1219 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1220 ClonedFuncBFI.get(), &BPI, 1221 /* AllowVarargs */ true) 1222 .extractCodeRegion(); 1223 1224 if (OutlinedFunc) { 1225 BasicBlock *OutliningCallBB = 1226 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) 1227 .getInstruction() 1228 ->getParent(); 1229 assert(OutliningCallBB->getParent() == ClonedFunc); 1230 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1231 } else 1232 ORE.emit([&]() { 1233 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1234 &ToExtract.front()->front()) 1235 << "Failed to extract region at block " 1236 << ore::NV("Block", ToExtract.front()); 1237 }); 1238 1239 return OutlinedFunc; 1240 } 1241 1242 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1243 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1244 // users (function pointers, etc.) back to the original function. 1245 ClonedFunc->replaceAllUsesWith(OrigFunc); 1246 ClonedFunc->eraseFromParent(); 1247 if (!IsFunctionInlined) { 1248 // Remove each function that was speculatively created if there is no 1249 // reference. 
1250 for (auto FuncBBPair : OutlinedFunctions) { 1251 Function *Func = FuncBBPair.first; 1252 Func->eraseFromParent(); 1253 } 1254 } 1255 } 1256 1257 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) { 1258 1259 if (F->hasAddressTaken()) 1260 return {false, nullptr}; 1261 1262 // Let inliner handle it 1263 if (F->hasFnAttribute(Attribute::AlwaysInline)) 1264 return {false, nullptr}; 1265 1266 if (F->hasFnAttribute(Attribute::NoInline)) 1267 return {false, nullptr}; 1268 1269 if (PSI->isFunctionEntryCold(F)) 1270 return {false, nullptr}; 1271 1272 if (F->user_begin() == F->user_end()) 1273 return {false, nullptr}; 1274 1275 auto &ORE = (*GetORE)(*F); 1276 1277 // Only try to outline cold regions if we have a profile summary, which 1278 // implies we have profiling information. 1279 if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() && 1280 !DisableMultiRegionPartialInline) { 1281 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1282 computeOutliningColdRegionsInfo(F); 1283 if (OMRI) { 1284 FunctionCloner Cloner(F, OMRI.get(), ORE); 1285 1286 #ifndef NDEBUG 1287 if (TracePartialInlining) { 1288 dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n"; 1289 dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold() 1290 << "\n"; 1291 } 1292 #endif 1293 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1294 1295 if (DidOutline) { 1296 #ifndef NDEBUG 1297 if (TracePartialInlining) { 1298 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1299 Cloner.ClonedFunc->print(dbgs()); 1300 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1301 } 1302 #endif 1303 1304 if (tryPartialInline(Cloner)) 1305 return {true, nullptr}; 1306 } 1307 } 1308 } 1309 1310 // Fall-thru to regular partial inlining if we: 1311 // i) can't find any cold regions to outline, or 1312 // ii) can't inline the outlined function anywhere. 
1313 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1314 if (!OI) 1315 return {false, nullptr}; 1316 1317 FunctionCloner Cloner(F, OI.get(), ORE); 1318 Cloner.NormalizeReturnBlock(); 1319 1320 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1321 1322 if (!OutlinedFunction) 1323 return {false, nullptr}; 1324 1325 bool AnyInline = tryPartialInline(Cloner); 1326 1327 if (AnyInline) 1328 return {true, OutlinedFunction}; 1329 1330 return {false, nullptr}; 1331 } 1332 1333 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1334 if (Cloner.OutlinedFunctions.empty()) 1335 return false; 1336 1337 int SizeCost = 0; 1338 BlockFrequency WeightedRcost; 1339 int NonWeightedRcost; 1340 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); 1341 1342 // Only calculate RelativeToEntryFreq when we are doing single region 1343 // outlining. 1344 BranchProbability RelativeToEntryFreq; 1345 if (Cloner.ClonedOI) { 1346 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1347 } else 1348 // RelativeToEntryFreq doesn't make sense when we have more than one 1349 // outlined call because each call will have a different relative frequency 1350 // to the entry block. We can consider using the average, but the 1351 // usefulness of that information is questionable. For now, assume we never 1352 // execute the calls to outlined functions. 1353 RelativeToEntryFreq = BranchProbability(0, 1); 1354 1355 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1356 1357 // The call sequence(s) to the outlined function(s) are larger than the sum of 1358 // the original outlined region size(s), it does not increase the chances of 1359 // inlining the function with outlining (The inliner uses the size increase to 1360 // model the cost of inlining a callee). 
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    auto &ORE = (*GetORE)(*Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  // Snapshot the users: successful inlining below removes call sites and
  // mutates the use list while we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    // Honor the -max-partial-inlining style limit, if any.
    if (IsLimitReached())
      continue;


    if (!shouldPartialInline(CS, Cloner, WeightedRcost))
      continue;

    auto &ORE = (*GetORE)(*CS.getCaller());
    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    ORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    // The remaining entry count stays with the original function.
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
    auto &ORE = (*GetORE)(*Cloner.OrigFunc);
    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}

// Pass driver: attempt partial inlining for every function in \p M that has
// uses and a definition.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    // Skip functions with a call to themselves from one of their own blocks.
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    // A newly created outlined function (second member) is itself a
    // candidate, so push it back onto the worklist.
    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    if (Result.first) {
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

// New pass manager entry point: wire the analysis getters to the function
// analysis manager and run the shared implementation.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
      [&FAM](Function &F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}