1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This pass performs partial inlining, typically by inlining an if statement 11 // that surrounds the body of the function. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/Transforms/IPO/PartialInlining.h" 16 #include "llvm/ADT/DenseMap.h" 17 #include "llvm/ADT/DenseSet.h" 18 #include "llvm/ADT/None.h" 19 #include "llvm/ADT/Optional.h" 20 #include "llvm/ADT/STLExtras.h" 21 #include "llvm/ADT/SmallVector.h" 22 #include "llvm/ADT/Statistic.h" 23 #include "llvm/Analysis/BlockFrequencyInfo.h" 24 #include "llvm/Analysis/BranchProbabilityInfo.h" 25 #include "llvm/Analysis/InlineCost.h" 26 #include "llvm/Analysis/LoopInfo.h" 27 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 28 #include "llvm/Analysis/ProfileSummaryInfo.h" 29 #include "llvm/Analysis/TargetLibraryInfo.h" 30 #include "llvm/Analysis/TargetTransformInfo.h" 31 #include "llvm/IR/Attributes.h" 32 #include "llvm/IR/BasicBlock.h" 33 #include "llvm/IR/CFG.h" 34 #include "llvm/IR/CallSite.h" 35 #include "llvm/IR/DebugLoc.h" 36 #include "llvm/IR/DiagnosticInfo.h" 37 #include "llvm/IR/Dominators.h" 38 #include "llvm/IR/Function.h" 39 #include "llvm/IR/InstrTypes.h" 40 #include "llvm/IR/Instruction.h" 41 #include "llvm/IR/Instructions.h" 42 #include "llvm/IR/IntrinsicInst.h" 43 #include "llvm/IR/Intrinsics.h" 44 #include "llvm/IR/Module.h" 45 #include "llvm/IR/User.h" 46 #include "llvm/Pass.h" 47 #include "llvm/Support/BlockFrequency.h" 48 #include "llvm/Support/BranchProbability.h" 49 #include "llvm/Support/Casting.h" 50 #include "llvm/Support/CommandLine.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Transforms/IPO.h" 53 #include "llvm/Transforms/Utils/Cloning.h" 54 #include "llvm/Transforms/Utils/CodeExtractor.h" 55 #include "llvm/Transforms/Utils/ValueMapper.h" 56 #include <algorithm> 57 #include <cassert> 58 #include <cstdint> 59 #include <functional> 60 #include <iterator> 61 #include <memory> 62 #include <tuple> 63 #include <vector> 64 65 using namespace llvm; 66 67 #define DEBUG_TYPE "partial-inlining" 68 69 STATISTIC(NumPartialInlined, 70 "Number of callsites functions partially inlined into."); 71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " 72 "cold outlined regions were partially " 73 "inlined into its caller(s)."); 74 STATISTIC(NumColdRegionsFound, 75 "Number of cold single entry/exit regions found."); 76 STATISTIC(NumColdRegionsOutlined, 77 "Number of cold single entry/exit regions outlined."); 78 79 // Command line option to disable partial-inlining. The default is false: 80 static cl::opt<bool> 81 DisablePartialInlining("disable-partial-inlining", cl::init(false), 82 cl::Hidden, cl::desc("Disable partial inlining")); 83 // Command line option to disable multi-region partial-inlining. The default is 84 // false: 85 static cl::opt<bool> DisableMultiRegionPartialInline( 86 "disable-mr-partial-inlining", cl::init(false), cl::Hidden, 87 cl::desc("Disable multi-region partial inlining")); 88 89 // Command line option to force outlining in regions with live exit variables. 90 // The default is false: 91 static cl::opt<bool> 92 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, 93 cl::desc("Force outline regions with live exits")); 94 95 // Command line option to enable marking outline functions with Cold Calling 96 // Convention. The default is false: 97 static cl::opt<bool> 98 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, 99 cl::desc("Mark outline function calls with ColdCC")); 100 101 #ifndef NDEBUG 102 // Command line option to debug partial-inlining. The default is none: 103 static cl::opt<bool> TracePartialInlining("trace-partial-inlining", 104 cl::init(false), cl::Hidden, 105 cl::desc("Trace partial inlining.")); 106 #endif 107 108 // This is an option used by testing: 109 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 110 cl::init(false), cl::ZeroOrMore, 111 cl::ReallyHidden, 112 cl::desc("Skip Cost Analysis")); 113 // Used to determine if a cold region is worth outlining based on 114 // its inlining cost compared to the original function. Default is set at 10%. 115 // ie. if the cold region reduces the inlining cost of the original function by 116 // at least 10%. 117 static cl::opt<float> MinRegionSizeRatio( 118 "min-region-size-ratio", cl::init(0.1), cl::Hidden, 119 cl::desc("Minimum ratio comparing relative sizes of each " 120 "outline candidate and original function")); 121 // Used to tune the minimum number of execution counts needed in the predecessor 122 // block to the cold edge. ie. confidence interval. 123 static cl::opt<unsigned> 124 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, 125 cl::desc("Minimum block executions to consider " 126 "its BranchProbabilityInfo valid")); 127 // Used to determine when an edge is considered cold. Default is set to 10%. ie. 128 // if the branch probability is 10% or less, then it is deemed as 'cold'. 129 static cl::opt<float> ColdBranchRatio( 130 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 131 cl::desc("Minimum BranchProbability to consider a region cold.")); 132 133 static cl::opt<unsigned> MaxNumInlineBlocks( 134 "max-num-inline-blocks", cl::init(5), cl::Hidden, 135 cl::desc("Max number of blocks to be partially inlined")); 136 137 // Command line option to set the maximum number of partial inlining allowed 138 // for the module. The default value of -1 means no limit. 139 static cl::opt<int> MaxNumPartialInlining( 140 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 141 cl::desc("Max number of partial inlining. The default is unlimited")); 142 143 // Used only when PGO or user annotated branch data is absent. It is 144 // the least value that is used to weigh the outline region. If BFI 145 // produces larger value, the BFI value will be used. 146 static cl::opt<int> 147 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 148 cl::Hidden, cl::ZeroOrMore, 149 cl::desc("Relative frequency of outline region to " 150 "the entry block")); 151 152 static cl::opt<unsigned> ExtraOutliningPenalty( 153 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 154 cl::desc("A debug option to add additional penalty to the computed one.")); 155 156 namespace { 157 158 struct FunctionOutliningInfo { 159 FunctionOutliningInfo() = default; 160 161 // Returns the number of blocks to be inlined including all blocks 162 // in Entries and one return block. 163 unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; } 164 165 // A set of blocks including the function entry that guard 166 // the region to be outlined. 167 SmallVector<BasicBlock *, 4> Entries; 168 169 // The return block that is not included in the outlined region. 170 BasicBlock *ReturnBlock = nullptr; 171 172 // The dominating block of the region to be outlined. 173 BasicBlock *NonReturnBlock = nullptr; 174 175 // The set of blocks in Entries that that are predecessors to ReturnBlock 176 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 177 }; 178 179 struct FunctionOutliningMultiRegionInfo { 180 FunctionOutliningMultiRegionInfo() 181 : ORI() {} 182 183 // Container for outline regions 184 struct OutlineRegionInfo { 185 OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region, 186 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 187 BasicBlock *ReturnBlock) 188 : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock), 189 ReturnBlock(ReturnBlock) {} 190 SmallVector<BasicBlock *, 8> Region; 191 BasicBlock *EntryBlock; 192 BasicBlock *ExitBlock; 193 BasicBlock *ReturnBlock; 194 }; 195 196 SmallVector<OutlineRegionInfo, 4> ORI; 197 }; 198 199 struct PartialInlinerImpl { 200 201 PartialInlinerImpl( 202 std::function<AssumptionCache &(Function &)> *GetAC, 203 std::function<TargetTransformInfo &(Function &)> *GTTI, 204 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI, 205 ProfileSummaryInfo *ProfSI, 206 std::function<OptimizationRemarkEmitter &(Function &)> *GORE) 207 : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI), 208 GetORE(GORE) {} 209 210 bool run(Module &M); 211 // Main part of the transformation that calls helper functions to find 212 // outlining candidates, clone & outline the function, and attempt to 213 // partially inline the resulting function. Returns true if 214 // inlining was successful, false otherwise. Also returns the outline 215 // function (only if we partially inlined early returns) as there is a 216 // possibility to further "peel" early return statements that were left in the 217 // outline function due to code size. 218 std::pair<bool, Function *> unswitchFunction(Function *F); 219 220 // This class speculatively clones the function to be partial inlined. 221 // At the end of partial inlining, the remaining callsites to the cloned 222 // function that are not partially inlined will be fixed up to reference 223 // the original function, and the cloned function will be erased. 224 struct FunctionCloner { 225 // Two constructors, one for single region outlining, the other for 226 // multi-region outlining. 227 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 228 OptimizationRemarkEmitter &ORE); 229 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 230 OptimizationRemarkEmitter &ORE); 231 ~FunctionCloner(); 232 233 // Prepare for function outlining: making sure there is only 234 // one incoming edge from the extracted/outlined region to 235 // the return block. 236 void NormalizeReturnBlock(); 237 238 // Do function outlining for cold regions. 239 bool doMultiRegionFunctionOutlining(); 240 // Do function outlining for region after early return block(s). 241 // NOTE: For vararg functions that do the vararg handling in the outlined 242 // function, we temporarily generate IR that does not properly 243 // forward varargs to the outlined function. Calling InlineFunction 244 // will update calls to the outlined functions to properly forward 245 // the varargs. 246 Function *doSingleRegionFunctionOutlining(); 247 248 Function *OrigFunc = nullptr; 249 Function *ClonedFunc = nullptr; 250 251 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 252 // Keep track of Outlined Functions and the basic block they're called from. 253 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 254 255 // ClonedFunc is inlined in one of its callers after function 256 // outlining. 257 bool IsFunctionInlined = false; 258 // The cost of the region to be outlined. 259 int OutlinedRegionCost = 0; 260 // ClonedOI is specific to outlining non-early return blocks. 261 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 262 // ClonedOMRI is specific to outlining cold regions. 263 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 264 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 265 OptimizationRemarkEmitter &ORE; 266 }; 267 268 private: 269 int NumPartialInlining = 0; 270 std::function<AssumptionCache &(Function &)> *GetAssumptionCache; 271 std::function<TargetTransformInfo &(Function &)> *GetTTI; 272 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI; 273 ProfileSummaryInfo *PSI; 274 std::function<OptimizationRemarkEmitter &(Function &)> *GetORE; 275 276 // Return the frequency of the OutlininingBB relative to F's entry point. 277 // The result is no larger than 1 and is represented using BP. 278 // (Note that the outlined region's 'head' block can only have incoming 279 // edges from the guarding entry blocks). 280 BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner); 281 282 // Return true if the callee of CS should be partially inlined with 283 // profit. 284 bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner, 285 BlockFrequency WeightedOutliningRcost); 286 287 // Try to inline DuplicateFunction (cloned from F with call to 288 // the OutlinedFunction into its callers. Return true 289 // if there is any successful inlining. 290 bool tryPartialInline(FunctionCloner &Cloner); 291 292 // Compute the mapping from use site of DuplicationFunction to the enclosing 293 // BB's profile count. 294 void computeCallsiteToProfCountMap(Function *DuplicateFunction, 295 DenseMap<User *, uint64_t> &SiteCountMap); 296 297 bool IsLimitReached() { 298 return (MaxNumPartialInlining != -1 && 299 NumPartialInlining >= MaxNumPartialInlining); 300 } 301 302 static CallSite getCallSite(User *U) { 303 CallSite CS; 304 if (CallInst *CI = dyn_cast<CallInst>(U)) 305 CS = CallSite(CI); 306 else if (InvokeInst *II = dyn_cast<InvokeInst>(U)) 307 CS = CallSite(II); 308 else 309 llvm_unreachable("All uses must be calls"); 310 return CS; 311 } 312 313 static CallSite getOneCallSiteTo(Function *F) { 314 User *User = *F->user_begin(); 315 return getCallSite(User); 316 } 317 318 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) { 319 CallSite CS = getOneCallSiteTo(F); 320 DebugLoc DLoc = CS.getInstruction()->getDebugLoc(); 321 BasicBlock *Block = CS.getParent(); 322 return std::make_tuple(DLoc, Block); 323 } 324 325 // Returns the costs associated with function outlining: 326 // - The first value is the non-weighted runtime cost for making the call 327 // to the outlined function, including the addtional setup cost in the 328 // outlined function itself; 329 // - The second value is the estimated size of the new call sequence in 330 // basic block Cloner.OutliningCallBB; 331 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner); 332 333 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 334 // approximate both the size and runtime cost (Note that in the current 335 // inline cost analysis, there is no clear distinction there either). 336 static int computeBBInlineCost(BasicBlock *BB); 337 338 std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F); 339 std::unique_ptr<FunctionOutliningMultiRegionInfo> 340 computeOutliningColdRegionsInfo(Function *F); 341 }; 342 343 struct PartialInlinerLegacyPass : public ModulePass { 344 static char ID; // Pass identification, replacement for typeid 345 346 PartialInlinerLegacyPass() : ModulePass(ID) { 347 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 348 } 349 350 void getAnalysisUsage(AnalysisUsage &AU) const override { 351 AU.addRequired<AssumptionCacheTracker>(); 352 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 353 AU.addRequired<TargetTransformInfoWrapperPass>(); 354 } 355 356 bool runOnModule(Module &M) override { 357 if (skipModule(M)) 358 return false; 359 360 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 361 TargetTransformInfoWrapperPass *TTIWP = 362 &getAnalysis<TargetTransformInfoWrapperPass>(); 363 ProfileSummaryInfo *PSI = 364 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 365 std::unique_ptr<OptimizationRemarkEmitter> UPORE; 366 367 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 368 [&ACT](Function &F) -> AssumptionCache & { 369 return ACT->getAssumptionCache(F); 370 }; 371 372 std::function<TargetTransformInfo &(Function &)> GetTTI = 373 [&TTIWP](Function &F) -> TargetTransformInfo & { 374 return TTIWP->getTTI(F); 375 }; 376 377 std::function<OptimizationRemarkEmitter &(Function &)> GetORE = 378 [&UPORE](Function &F) -> OptimizationRemarkEmitter & { 379 UPORE.reset(new OptimizationRemarkEmitter(&F)); 380 return *UPORE.get(); 381 }; 382 383 return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI, 384 &GetORE) 385 .run(M); 386 } 387 }; 388 389 } // end anonymous namespace 390 391 std::unique_ptr<FunctionOutliningMultiRegionInfo> 392 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) { 393 BasicBlock *EntryBlock = &F->front(); 394 395 DominatorTree DT(*F); 396 LoopInfo LI(DT); 397 BranchProbabilityInfo BPI(*F, LI); 398 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 399 BlockFrequencyInfo *BFI; 400 if (!GetBFI) { 401 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI)); 402 BFI = ScopedBFI.get(); 403 } else 404 BFI = &(*GetBFI)(*F); 405 406 auto &ORE = (*GetORE)(*F); 407 408 // Return if we don't have profiling information. 409 if (!PSI->hasInstrumentationProfile()) 410 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 411 412 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 413 llvm::make_unique<FunctionOutliningMultiRegionInfo>(); 414 415 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { 416 BasicBlock *Dom = BlockList.front(); 417 return BlockList.size() > 1 && 418 std::distance(pred_begin(Dom), pred_end(Dom)) == 1; 419 }; 420 421 auto IsSingleExit = 422 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 423 BasicBlock *ExitBlock = nullptr; 424 for (auto *Block : BlockList) { 425 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { 426 if (!is_contained(BlockList, *SI)) { 427 if (ExitBlock) { 428 ORE.emit([&]() { 429 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 430 &SI->front()) 431 << "Region dominated by " 432 << ore::NV("Block", BlockList.front()->getName()) 433 << " has more than one region exit edge."; 434 }); 435 return nullptr; 436 } else 437 ExitBlock = Block; 438 } 439 } 440 } 441 return ExitBlock; 442 }; 443 444 auto BBProfileCount = [BFI](BasicBlock *BB) { 445 return BFI->getBlockProfileCount(BB) 446 ? BFI->getBlockProfileCount(BB).getValue() 447 : 0; 448 }; 449 450 // Use the same computeBBInlineCost function to compute the cost savings of 451 // the outlining the candidate region. 452 int OverallFunctionCost = 0; 453 for (auto &BB : *F) 454 OverallFunctionCost += computeBBInlineCost(&BB); 455 456 #ifndef NDEBUG 457 if (TracePartialInlining) 458 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n"; 459 #endif 460 int MinOutlineRegionCost = 461 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); 462 BranchProbability MinBranchProbability( 463 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 464 MinBlockCounterExecution); 465 bool ColdCandidateFound = false; 466 BasicBlock *CurrEntry = EntryBlock; 467 std::vector<BasicBlock *> DFS; 468 DenseMap<BasicBlock *, bool> VisitedMap; 469 DFS.push_back(CurrEntry); 470 VisitedMap[CurrEntry] = true; 471 // Use Depth First Search on the basic blocks to find CFG edges that are 472 // considered cold. 473 // Cold regions considered must also have its inline cost compared to the 474 // overall inline cost of the original function. The region is outlined only 475 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 476 // more. 477 while (!DFS.empty()) { 478 auto *thisBB = DFS.back(); 479 DFS.pop_back(); 480 // Only consider regions with predecessor blocks that are considered 481 // not-cold (default: part of the top 99.99% of all block counters) 482 // AND greater than our minimum block execution count (default: 100). 483 if (PSI->isColdBB(thisBB, BFI) || 484 BBProfileCount(thisBB) < MinBlockCounterExecution) 485 continue; 486 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { 487 if (VisitedMap[*SI]) 488 continue; 489 VisitedMap[*SI] = true; 490 DFS.push_back(*SI); 491 // If branch isn't cold, we skip to the next one. 492 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI); 493 if (SuccProb > MinBranchProbability) 494 continue; 495 #ifndef NDEBUG 496 if (TracePartialInlining) { 497 dbgs() << "Found cold edge: " << thisBB->getName() << "->" 498 << (*SI)->getName() << "\nBranch Probability = " << SuccProb 499 << "\n"; 500 } 501 #endif 502 SmallVector<BasicBlock *, 8> DominateVector; 503 DT.getDescendants(*SI, DominateVector); 504 // We can only outline single entry regions (for now). 505 if (!IsSingleEntry(DominateVector)) 506 continue; 507 BasicBlock *ExitBlock = nullptr; 508 // We can only outline single exit regions (for now). 509 if (!(ExitBlock = IsSingleExit(DominateVector))) 510 continue; 511 int OutlineRegionCost = 0; 512 for (auto *BB : DominateVector) 513 OutlineRegionCost += computeBBInlineCost(BB); 514 515 #ifndef NDEBUG 516 if (TracePartialInlining) 517 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n"; 518 #endif 519 520 if (OutlineRegionCost < MinOutlineRegionCost) { 521 ORE.emit([&]() { 522 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 523 &SI->front()) 524 << ore::NV("Callee", F) << " inline cost-savings smaller than " 525 << ore::NV("Cost", MinOutlineRegionCost); 526 }); 527 continue; 528 } 529 // For now, ignore blocks that belong to a SISE region that is a 530 // candidate for outlining. In the future, we may want to look 531 // at inner regions because the outer region may have live-exit 532 // variables. 533 for (auto *BB : DominateVector) 534 VisitedMap[BB] = true; 535 // ReturnBlock here means the block after the outline call 536 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 537 // assert(ReturnBlock && "ReturnBlock is NULL somehow!"); 538 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 539 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 540 RegInfo.Region = DominateVector; 541 OutliningInfo->ORI.push_back(RegInfo); 542 #ifndef NDEBUG 543 if (TracePartialInlining) { 544 dbgs() << "Found Cold Candidate starting at block: " 545 << DominateVector.front()->getName() << "\n"; 546 } 547 #endif 548 ColdCandidateFound = true; 549 NumColdRegionsFound++; 550 } 551 } 552 if (ColdCandidateFound) 553 return OutliningInfo; 554 else 555 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 556 } 557 558 std::unique_ptr<FunctionOutliningInfo> 559 PartialInlinerImpl::computeOutliningInfo(Function *F) { 560 BasicBlock *EntryBlock = &F->front(); 561 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 562 if (!BR || BR->isUnconditional()) 563 return std::unique_ptr<FunctionOutliningInfo>(); 564 565 // Returns true if Succ is BB's successor 566 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 567 return is_contained(successors(BB), Succ); 568 }; 569 570 auto SuccSize = [](BasicBlock *BB) { 571 return std::distance(succ_begin(BB), succ_end(BB)); 572 }; 573 574 auto IsReturnBlock = [](BasicBlock *BB) { 575 TerminatorInst *TI = BB->getTerminator(); 576 return isa<ReturnInst>(TI); 577 }; 578 579 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 580 if (IsReturnBlock(Succ1)) 581 return std::make_tuple(Succ1, Succ2); 582 if (IsReturnBlock(Succ2)) 583 return std::make_tuple(Succ2, Succ1); 584 585 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 586 }; 587 588 // Detect a triangular shape: 589 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 590 if (IsSuccessor(Succ1, Succ2)) 591 return std::make_tuple(Succ1, Succ2); 592 if (IsSuccessor(Succ2, Succ1)) 593 return std::make_tuple(Succ2, Succ1); 594 595 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 596 }; 597 598 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 599 llvm::make_unique<FunctionOutliningInfo>(); 600 601 BasicBlock *CurrEntry = EntryBlock; 602 bool CandidateFound = false; 603 do { 604 // The number of blocks to be inlined has already reached 605 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 606 // disables partial inlining for the function. 607 if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks) 608 break; 609 610 if (SuccSize(CurrEntry) != 2) 611 break; 612 613 BasicBlock *Succ1 = *succ_begin(CurrEntry); 614 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 615 616 BasicBlock *ReturnBlock, *NonReturnBlock; 617 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 618 619 if (ReturnBlock) { 620 OutliningInfo->Entries.push_back(CurrEntry); 621 OutliningInfo->ReturnBlock = ReturnBlock; 622 OutliningInfo->NonReturnBlock = NonReturnBlock; 623 CandidateFound = true; 624 break; 625 } 626 627 BasicBlock *CommSucc; 628 BasicBlock *OtherSucc; 629 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 630 631 if (!CommSucc) 632 break; 633 634 OutliningInfo->Entries.push_back(CurrEntry); 635 CurrEntry = OtherSucc; 636 } while (true); 637 638 if (!CandidateFound) 639 return std::unique_ptr<FunctionOutliningInfo>(); 640 641 // Do sanity check of the entries: threre should not 642 // be any successors (not in the entry set) other than 643 // {ReturnBlock, NonReturnBlock} 644 assert(OutliningInfo->Entries[0] == &F->front() && 645 "Function Entry must be the first in Entries vector"); 646 DenseSet<BasicBlock *> Entries; 647 for (BasicBlock *E : OutliningInfo->Entries) 648 Entries.insert(E); 649 650 // Returns true of BB has Predecessor which is not 651 // in Entries set. 652 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 653 for (auto Pred : predecessors(BB)) { 654 if (!Entries.count(Pred)) 655 return true; 656 } 657 return false; 658 }; 659 auto CheckAndNormalizeCandidate = 660 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 661 for (BasicBlock *E : OutliningInfo->Entries) { 662 for (auto Succ : successors(E)) { 663 if (Entries.count(Succ)) 664 continue; 665 if (Succ == OutliningInfo->ReturnBlock) 666 OutliningInfo->ReturnBlockPreds.push_back(E); 667 else if (Succ != OutliningInfo->NonReturnBlock) 668 return false; 669 } 670 // There should not be any outside incoming edges either: 671 if (HasNonEntryPred(E)) 672 return false; 673 } 674 return true; 675 }; 676 677 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 678 return std::unique_ptr<FunctionOutliningInfo>(); 679 680 // Now further growing the candidate's inlining region by 681 // peeling off dominating blocks from the outlining region: 682 while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) { 683 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 684 if (SuccSize(Cand) != 2) 685 break; 686 687 if (HasNonEntryPred(Cand)) 688 break; 689 690 BasicBlock *Succ1 = *succ_begin(Cand); 691 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 692 693 BasicBlock *ReturnBlock, *NonReturnBlock; 694 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 695 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 696 break; 697 698 if (NonReturnBlock->getSinglePredecessor() != Cand) 699 break; 700 701 // Now grow and update OutlininigInfo: 702 OutliningInfo->Entries.push_back(Cand); 703 OutliningInfo->NonReturnBlock = NonReturnBlock; 704 OutliningInfo->ReturnBlockPreds.push_back(Cand); 705 Entries.insert(Cand); 706 } 707 708 return OutliningInfo; 709 } 710 711 // Check if there is PGO data or user annoated branch data: 712 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) { 713 if (F->hasProfileData()) 714 return true; 715 // Now check if any of the entry block has MD_prof data: 716 for (auto *E : OI->Entries) { 717 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 718 if (!BR || BR->isUnconditional()) 719 continue; 720 uint64_t T, F; 721 if (BR->extractProfMetadata(T, F)) 722 return true; 723 } 724 return false; 725 } 726 727 BranchProbability 728 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) { 729 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 730 auto EntryFreq = 731 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 732 auto OutliningCallFreq = 733 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 734 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 735 // we outlined any regions, so we may encounter situations where the 736 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 737 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) { 738 OutliningCallFreq = EntryFreq; 739 } 740 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 741 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 742 743 if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get())) 744 return OutlineRegionRelFreq; 745 746 // When profile data is not available, we need to be conservative in 747 // estimating the overall savings. Static branch prediction can usually 748 // guess the branch direction right (taken/non-taken), but the guessed 749 // branch probability is usually not biased enough. In case when the 750 // outlined region is predicted to be likely, its probability needs 751 // to be made higher (more biased) to not under-estimate the cost of 752 // function outlining. On the other hand, if the outlined region 753 // is predicted to be less likely, the predicted probablity is usually 754 // higher than the actual. For instance, the actual probability of the 755 // less likely target is only 5%, but the guessed probablity can be 756 // 40%. In the latter case, there is no need for further adjustement. 757 // FIXME: add an option for this. 758 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 759 return OutlineRegionRelFreq; 760 761 OutlineRegionRelFreq = std::max( 762 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 763 764 return OutlineRegionRelFreq; 765 } 766 767 bool PartialInlinerImpl::shouldPartialInline( 768 CallSite CS, FunctionCloner &Cloner, 769 BlockFrequency WeightedOutliningRcost) { 770 using namespace ore; 771 772 if (SkipCostAnalysis) 773 return true; 774 775 Instruction *Call = CS.getInstruction(); 776 Function *Callee = CS.getCalledFunction(); 777 assert(Callee == Cloner.ClonedFunc); 778 779 Function *Caller = CS.getCaller(); 780 auto &CalleeTTI = (*GetTTI)(*Callee); 781 auto &ORE = (*GetORE)(*Caller); 782 InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI, 783 *GetAssumptionCache, GetBFI, PSI, &ORE); 784 785 if (IC.isAlways()) { 786 ORE.emit([&]() { 787 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) 788 << NV("Callee", Cloner.OrigFunc) 789 << " should always be fully inlined, not partially"; 790 }); 791 return false; 792 } 793 794 if (IC.isNever()) { 795 ORE.emit([&]() { 796 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) 797 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 798 << NV("Caller", Caller) 799 << " because it should never be inlined (cost=never)"; 800 }); 801 return false; 802 } 803 804 if (!IC) { 805 ORE.emit([&]() { 806 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) 807 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 808 << NV("Caller", Caller) << " because too costly to inline (cost=" 809 << NV("Cost", IC.getCost()) << ", threshold=" 810 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 811 }); 812 return false; 813 } 814 const DataLayout &DL = Caller->getParent()->getDataLayout(); 815 816 // The savings of eliminating the call: 817 int NonWeightedSavings = getCallsiteCost(CS, DL); 818 BlockFrequency NormWeightedSavings(NonWeightedSavings); 819 820 // Weighted saving is smaller than weighted cost, return false 821 if (NormWeightedSavings < WeightedOutliningRcost) { 822 ORE.emit([&]() { 823 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 824 Call) 825 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 826 << NV("Caller", Caller) << " runtime overhead (overhead=" 827 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 828 << ", savings=" 829 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 830 << ")" 831 << " of making the outlined call is too high"; 832 }); 833 834 return false; 835 } 836 837 ORE.emit([&]() { 838 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) 839 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 840 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 841 << " (threshold=" 842 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 843 }); 844 return true; 845 } 846 847 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 848 // For now use a simplified version. The returned 'InlineCost' will be used 849 // to esimate the size cost as well as runtime cost of the BB. 850 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { 851 int InlineCost = 0; 852 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 853 for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { 854 if (isa<DbgInfoIntrinsic>(I)) 855 continue; 856 857 switch (I->getOpcode()) { 858 case Instruction::BitCast: 859 case Instruction::PtrToInt: 860 case Instruction::IntToPtr: 861 case Instruction::Alloca: 862 continue; 863 case Instruction::GetElementPtr: 864 if (cast<GetElementPtrInst>(I)->hasAllZeroIndices()) 865 continue; 866 break; 867 default: 868 break; 869 } 870 871 IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I); 872 if (IntrInst) { 873 if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start || 874 IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) 875 continue; 876 } 877 878 if (CallInst *CI = dyn_cast<CallInst>(I)) { 879 InlineCost += getCallsiteCost(CallSite(CI), DL); 880 continue; 881 } 882 883 if (InvokeInst *II = dyn_cast<InvokeInst>(I)) { 884 InlineCost += getCallsiteCost(CallSite(II), DL); 885 continue; 886 } 887 888 if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) { 889 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 890 continue; 891 } 892 InlineCost += InlineConstants::InstrCost; 893 } 894 return InlineCost; 895 } 896 897 std::tuple<int, int> 898 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { 899 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 900 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 901 Function *OutlinedFunc = FuncBBPair.first; 902 BasicBlock* OutliningCallBB = FuncBBPair.second; 903 // Now compute the cost of the call sequence to the outlined function 904 // 'OutlinedFunction' in BB 'OutliningCallBB': 905 OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB); 906 907 // Now compute the cost of the extracted/outlined function itself: 908 for (BasicBlock &BB : *OutlinedFunc) 909 OutlinedFunctionCost += computeBBInlineCost(&BB); 910 } 911 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 912 "Outlined function cost should be no less than the outlined region"); 913 914 // The code extractor introduces a new root and exit stub blocks with 915 // additional unconditional branches. Those branches will be eliminated 916 // later with bb layout. The cost should be adjusted accordingly: 917 OutlinedFunctionCost -= 918 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 919 920 int OutliningRuntimeOverhead = 921 OutliningFuncCallCost + 922 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 923 ExtraOutliningPenalty; 924 925 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 926 } 927 928 // Create the callsite to profile count map which is 929 // used to update the original function's entry count, 930 // after the function is partially inlined into the callsite. 931 void PartialInlinerImpl::computeCallsiteToProfCountMap( 932 Function *DuplicateFunction, 933 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) { 934 std::vector<User *> Users(DuplicateFunction->user_begin(), 935 DuplicateFunction->user_end()); 936 Function *CurrentCaller = nullptr; 937 std::unique_ptr<BlockFrequencyInfo> TempBFI; 938 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 939 940 auto ComputeCurrBFI = [&,this](Function *Caller) { 941 // For the old pass manager: 942 if (!GetBFI) { 943 DominatorTree DT(*Caller); 944 LoopInfo LI(DT); 945 BranchProbabilityInfo BPI(*Caller, LI); 946 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 947 CurrentCallerBFI = TempBFI.get(); 948 } else { 949 // New pass manager: 950 CurrentCallerBFI = &(*GetBFI)(*Caller); 951 } 952 }; 953 954 for (User *User : Users) { 955 CallSite CS = getCallSite(User); 956 Function *Caller = CS.getCaller(); 957 if (CurrentCaller != Caller) { 958 CurrentCaller = Caller; 959 ComputeCurrBFI(Caller); 960 } else { 961 assert(CurrentCallerBFI && "CallerBFI is not set"); 962 } 963 BasicBlock *CallBB = CS.getInstruction()->getParent(); 964 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 965 if (Count) 966 CallSiteToProfCountMap[User] = *Count; 967 else 968 CallSiteToProfCountMap[User] = 0; 969 } 970 } 971 972 PartialInlinerImpl::FunctionCloner::FunctionCloner( 973 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE) 974 : OrigFunc(F), ORE(ORE) { 975 ClonedOI = llvm::make_unique<FunctionOutliningInfo>(); 976 977 // Clone the function, so that we can hack away on it. 978 ValueToValueMapTy VMap; 979 ClonedFunc = CloneFunction(F, VMap); 980 981 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 982 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 983 for (BasicBlock *BB : OI->Entries) { 984 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 985 } 986 for (BasicBlock *E : OI->ReturnBlockPreds) { 987 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 988 ClonedOI->ReturnBlockPreds.push_back(NewE); 989 } 990 // Go ahead and update all uses to the duplicate, so that we can just 991 // use the inliner functionality when we're done hacking. 992 F->replaceAllUsesWith(ClonedFunc); 993 } 994 995 PartialInlinerImpl::FunctionCloner::FunctionCloner( 996 Function *F, FunctionOutliningMultiRegionInfo *OI, 997 OptimizationRemarkEmitter &ORE) 998 : OrigFunc(F), ORE(ORE) { 999 ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>(); 1000 1001 // Clone the function, so that we can hack away on it. 1002 ValueToValueMapTy VMap; 1003 ClonedFunc = CloneFunction(F, VMap); 1004 1005 // Go through all Outline Candidate Regions and update all BasicBlock 1006 // information. 1007 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1008 OI->ORI) { 1009 SmallVector<BasicBlock *, 8> Region; 1010 for (BasicBlock *BB : RegionInfo.Region) { 1011 Region.push_back(cast<BasicBlock>(VMap[BB])); 1012 } 1013 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1014 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1015 BasicBlock *NewReturnBlock = nullptr; 1016 if (RegionInfo.ReturnBlock) 1017 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1018 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1019 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1020 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1021 } 1022 // Go ahead and update all uses to the duplicate, so that we can just 1023 // use the inliner functionality when we're done hacking. 1024 F->replaceAllUsesWith(ClonedFunc); 1025 } 1026 1027 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() { 1028 auto getFirstPHI = [](BasicBlock *BB) { 1029 BasicBlock::iterator I = BB->begin(); 1030 PHINode *FirstPhi = nullptr; 1031 while (I != BB->end()) { 1032 PHINode *Phi = dyn_cast<PHINode>(I); 1033 if (!Phi) 1034 break; 1035 if (!FirstPhi) { 1036 FirstPhi = Phi; 1037 break; 1038 } 1039 } 1040 return FirstPhi; 1041 }; 1042 1043 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1044 // blocks. 1045 if (!ClonedOI) 1046 return; 1047 1048 // Special hackery is needed with PHI nodes that have inputs from more than 1049 // one extracted block. For simplicity, just split the PHIs into a two-level 1050 // sequence of PHIs, some of which will go in the extracted region, and some 1051 // of which will go outside. 1052 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1053 // only split block when necessary: 1054 PHINode *FirstPhi = getFirstPHI(PreReturn); 1055 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1056 1057 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1058 return; 1059 1060 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1061 Value *CommonValue = PN->getIncomingValue(0); 1062 if (all_of(PN->incoming_values(), 1063 [&](Value *V) { return V == CommonValue; })) 1064 return CommonValue; 1065 return nullptr; 1066 }; 1067 1068 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1069 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1070 BasicBlock::iterator I = PreReturn->begin(); 1071 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1072 SmallVector<Instruction *, 4> DeadPhis; 1073 while (I != PreReturn->end()) { 1074 PHINode *OldPhi = dyn_cast<PHINode>(I); 1075 if (!OldPhi) 1076 break; 1077 1078 PHINode *RetPhi = 1079 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1080 OldPhi->replaceAllUsesWith(RetPhi); 1081 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1082 1083 RetPhi->addIncoming(&*I, PreReturn); 1084 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1085 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1086 OldPhi->removeIncomingValue(E); 1087 } 1088 1089 // After incoming values splitting, the old phi may become trivial. 1090 // Keeping the trivial phi can introduce definition inside the outline 1091 // region which is live-out, causing necessary overhead (load, store 1092 // arg passing etc). 1093 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1094 OldPhi->replaceAllUsesWith(OldPhiVal); 1095 DeadPhis.push_back(OldPhi); 1096 } 1097 ++I; 1098 } 1099 for (auto *DP : DeadPhis) 1100 DP->eraseFromParent(); 1101 1102 for (auto E : ClonedOI->ReturnBlockPreds) { 1103 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1104 } 1105 } 1106 1107 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1108 1109 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) { 1110 int Cost = 0; 1111 for (BasicBlock* BB : Region) 1112 Cost += computeBBInlineCost(BB); 1113 return Cost; 1114 }; 1115 1116 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1117 1118 if (ClonedOMRI->ORI.empty()) 1119 return false; 1120 1121 // The CodeExtractor needs a dominator tree. 1122 DominatorTree DT; 1123 DT.recalculate(*ClonedFunc); 1124 1125 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1126 LoopInfo LI(DT); 1127 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1128 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1129 1130 SetVector<Value *> Inputs, Outputs, Sinks; 1131 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1132 ClonedOMRI->ORI) { 1133 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); 1134 1135 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1136 ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false); 1137 1138 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1139 1140 #ifndef NDEBUG 1141 if (TracePartialInlining) { 1142 dbgs() << "inputs: " << Inputs.size() << "\n"; 1143 dbgs() << "outputs: " << Outputs.size() << "\n"; 1144 for (Value *value : Inputs) 1145 dbgs() << "value used in func: " << *value << "\n"; 1146 for (Value *output : Outputs) 1147 dbgs() << "instr used in func: " << *output << "\n"; 1148 } 1149 #endif 1150 // Do not extract regions that have live exit variables. 1151 if (Outputs.size() > 0 && !ForceLiveExit) 1152 continue; 1153 1154 Function *OutlinedFunc = CE.extractCodeRegion(); 1155 1156 if (OutlinedFunc) { 1157 CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc); 1158 BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent(); 1159 assert(OutliningCallBB->getParent() == ClonedFunc); 1160 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1161 NumColdRegionsOutlined++; 1162 OutlinedRegionCost += CurrentOutlinedRegionCost; 1163 1164 if (MarkOutlinedColdCC) { 1165 OutlinedFunc->setCallingConv(CallingConv::Cold); 1166 OCS.setCallingConv(CallingConv::Cold); 1167 } 1168 } else 1169 ORE.emit([&]() { 1170 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1171 &RegionInfo.Region.front()->front()) 1172 << "Failed to extract region at block " 1173 << ore::NV("Block", RegionInfo.Region.front()); 1174 }); 1175 } 1176 1177 return !OutlinedFunctions.empty(); 1178 } 1179 1180 Function * 1181 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1182 // Returns true if the block is to be partial inlined into the caller 1183 // (i.e. not to be extracted to the out of line function) 1184 auto ToBeInlined = [&, this](BasicBlock *BB) { 1185 return BB == ClonedOI->ReturnBlock || 1186 (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) != 1187 ClonedOI->Entries.end()); 1188 }; 1189 1190 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1191 // The CodeExtractor needs a dominator tree. 1192 DominatorTree DT; 1193 DT.recalculate(*ClonedFunc); 1194 1195 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1196 LoopInfo LI(DT); 1197 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1198 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1199 1200 // Gather up the blocks that we're going to extract. 1201 std::vector<BasicBlock *> ToExtract; 1202 ToExtract.push_back(ClonedOI->NonReturnBlock); 1203 OutlinedRegionCost += 1204 PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock); 1205 for (BasicBlock &BB : *ClonedFunc) 1206 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1207 ToExtract.push_back(&BB); 1208 // FIXME: the code extractor may hoist/sink more code 1209 // into the outlined function which may make the outlining 1210 // overhead (the difference of the outlined function cost 1211 // and OutliningRegionCost) look larger. 1212 OutlinedRegionCost += computeBBInlineCost(&BB); 1213 } 1214 1215 // Extract the body of the if. 1216 Function *OutlinedFunc = 1217 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1218 ClonedFuncBFI.get(), &BPI, 1219 /* AllowVarargs */ true) 1220 .extractCodeRegion(); 1221 1222 if (OutlinedFunc) { 1223 BasicBlock *OutliningCallBB = 1224 PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc) 1225 .getInstruction() 1226 ->getParent(); 1227 assert(OutliningCallBB->getParent() == ClonedFunc); 1228 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1229 } else 1230 ORE.emit([&]() { 1231 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1232 &ToExtract.front()->front()) 1233 << "Failed to extract region at block " 1234 << ore::NV("Block", ToExtract.front()); 1235 }); 1236 1237 return OutlinedFunc; 1238 } 1239 1240 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1241 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1242 // users (function pointers, etc.) back to the original function. 1243 ClonedFunc->replaceAllUsesWith(OrigFunc); 1244 ClonedFunc->eraseFromParent(); 1245 if (!IsFunctionInlined) { 1246 // Remove each function that was speculatively created if there is no 1247 // reference. 1248 for (auto FuncBBPair : OutlinedFunctions) { 1249 Function *Func = FuncBBPair.first; 1250 Func->eraseFromParent(); 1251 } 1252 } 1253 } 1254 1255 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) { 1256 1257 if (F->hasAddressTaken()) 1258 return {false, nullptr}; 1259 1260 // Let inliner handle it 1261 if (F->hasFnAttribute(Attribute::AlwaysInline)) 1262 return {false, nullptr}; 1263 1264 if (F->hasFnAttribute(Attribute::NoInline)) 1265 return {false, nullptr}; 1266 1267 if (PSI->isFunctionEntryCold(F)) 1268 return {false, nullptr}; 1269 1270 if (F->user_begin() == F->user_end()) 1271 return {false, nullptr}; 1272 1273 auto &ORE = (*GetORE)(*F); 1274 1275 // Only try to outline cold regions if we have a profile summary, which 1276 // implies we have profiling information. 1277 if (PSI->hasProfileSummary() && F->hasProfileData() && 1278 !DisableMultiRegionPartialInline) { 1279 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1280 computeOutliningColdRegionsInfo(F); 1281 if (OMRI) { 1282 FunctionCloner Cloner(F, OMRI.get(), ORE); 1283 1284 #ifndef NDEBUG 1285 if (TracePartialInlining) { 1286 dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n"; 1287 dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold() 1288 << "\n"; 1289 } 1290 #endif 1291 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1292 1293 if (DidOutline) { 1294 #ifndef NDEBUG 1295 if (TracePartialInlining) { 1296 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1297 Cloner.ClonedFunc->print(dbgs()); 1298 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1299 } 1300 #endif 1301 1302 if (tryPartialInline(Cloner)) 1303 return {true, nullptr}; 1304 } 1305 } 1306 } 1307 1308 // Fall-thru to regular partial inlining if we: 1309 // i) can't find any cold regions to outline, or 1310 // ii) can't inline the outlined function anywhere. 1311 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1312 if (!OI) 1313 return {false, nullptr}; 1314 1315 FunctionCloner Cloner(F, OI.get(), ORE); 1316 Cloner.NormalizeReturnBlock(); 1317 1318 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1319 1320 if (!OutlinedFunction) 1321 return {false, nullptr}; 1322 1323 bool AnyInline = tryPartialInline(Cloner); 1324 1325 if (AnyInline) 1326 return {true, OutlinedFunction}; 1327 1328 return {false, nullptr}; 1329 } 1330 1331 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1332 if (Cloner.OutlinedFunctions.empty()) 1333 return false; 1334 1335 int SizeCost = 0; 1336 BlockFrequency WeightedRcost; 1337 int NonWeightedRcost; 1338 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); 1339 1340 // Only calculate RelativeToEntryFreq when we are doing single region 1341 // outlining. 1342 BranchProbability RelativeToEntryFreq; 1343 if (Cloner.ClonedOI) { 1344 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1345 } else 1346 // RelativeToEntryFreq doesn't make sense when we have more than one 1347 // outlined call because each call will have a different relative frequency 1348 // to the entry block. We can consider using the average, but the 1349 // usefulness of that information is questionable. For now, assume we never 1350 // execute the calls to outlined functions. 1351 RelativeToEntryFreq = BranchProbability(0, 1); 1352 1353 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1354 1355 // The call sequence(s) to the outlined function(s) are larger than the sum of 1356 // the original outlined region size(s), it does not increase the chances of 1357 // inlining the function with outlining (The inliner uses the size increase to 1358 // model the cost of inlining a callee). 1359 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1360 auto &ORE = (*GetORE)(*Cloner.OrigFunc); 1361 DebugLoc DLoc; 1362 BasicBlock *Block; 1363 std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc); 1364 ORE.emit([&]() { 1365 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1366 DLoc, Block) 1367 << ore::NV("Function", Cloner.OrigFunc) 1368 << " not partially inlined into callers (Original Size = " 1369 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1370 << ", Size of call sequence to outlined function = " 1371 << ore::NV("NewSize", SizeCost) << ")"; 1372 }); 1373 return false; 1374 } 1375 1376 assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() && 1377 "F's users should all be replaced!"); 1378 1379 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1380 Cloner.ClonedFunc->user_end()); 1381 1382 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1383 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1384 if (CalleeEntryCount) 1385 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1386 1387 uint64_t CalleeEntryCountV = 1388 (CalleeEntryCount ? CalleeEntryCount.getCount() : 0); 1389 1390 bool AnyInline = false; 1391 for (User *User : Users) { 1392 CallSite CS = getCallSite(User); 1393 1394 if (IsLimitReached()) 1395 continue; 1396 1397 1398 if (!shouldPartialInline(CS, Cloner, WeightedRcost)) 1399 continue; 1400 1401 auto &ORE = (*GetORE)(*CS.getCaller()); 1402 // Construct remark before doing the inlining, as after successful inlining 1403 // the callsite is removed. 1404 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction()); 1405 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1406 << ore::NV("Caller", CS.getCaller()); 1407 1408 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI); 1409 // We can only forward varargs when we outlined a single region, else we 1410 // bail on vararg functions. 1411 if (!InlineFunction(CS, IFI, nullptr, true, 1412 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1413 : nullptr))) 1414 continue; 1415 1416 ORE.emit(OR); 1417 1418 // Now update the entry count: 1419 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1420 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1421 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1422 } 1423 1424 AnyInline = true; 1425 NumPartialInlining++; 1426 // Update the stats 1427 if (Cloner.ClonedOI) 1428 NumPartialInlined++; 1429 else 1430 NumColdOutlinePartialInlined++; 1431 1432 } 1433 1434 if (AnyInline) { 1435 Cloner.IsFunctionInlined = true; 1436 if (CalleeEntryCount) 1437 Cloner.OrigFunc->setEntryCount( 1438 CalleeEntryCount.setCount(CalleeEntryCountV)); 1439 auto &ORE = (*GetORE)(*Cloner.OrigFunc); 1440 ORE.emit([&]() { 1441 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1442 << "Partially inlined into at least one caller"; 1443 }); 1444 1445 } 1446 1447 return AnyInline; 1448 } 1449 1450 bool PartialInlinerImpl::run(Module &M) { 1451 if (DisablePartialInlining) 1452 return false; 1453 1454 std::vector<Function *> Worklist; 1455 Worklist.reserve(M.size()); 1456 for (Function &F : M) 1457 if (!F.use_empty() && !F.isDeclaration()) 1458 Worklist.push_back(&F); 1459 1460 bool Changed = false; 1461 while (!Worklist.empty()) { 1462 Function *CurrFunc = Worklist.back(); 1463 Worklist.pop_back(); 1464 1465 if (CurrFunc->use_empty()) 1466 continue; 1467 1468 bool Recursive = false; 1469 for (User *U : CurrFunc->users()) 1470 if (Instruction *I = dyn_cast<Instruction>(U)) 1471 if (I->getParent()->getParent() == CurrFunc) { 1472 Recursive = true; 1473 break; 1474 } 1475 if (Recursive) 1476 continue; 1477 1478 std::pair<bool, Function * > Result = unswitchFunction(CurrFunc); 1479 if (Result.second) 1480 Worklist.push_back(Result.second); 1481 if (Result.first) { 1482 Changed = true; 1483 } 1484 } 1485 1486 return Changed; 1487 } 1488 1489 char PartialInlinerLegacyPass::ID = 0; 1490 1491 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 1492 "Partial Inliner", false, false) 1493 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1494 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1495 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1496 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 1497 "Partial Inliner", false, false) 1498 1499 ModulePass *llvm::createPartialInliningPass() { 1500 return new PartialInlinerLegacyPass(); 1501 } 1502 1503 PreservedAnalyses PartialInlinerPass::run(Module &M, 1504 ModuleAnalysisManager &AM) { 1505 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1506 1507 std::function<AssumptionCache &(Function &)> GetAssumptionCache = 1508 [&FAM](Function &F) -> AssumptionCache & { 1509 return FAM.getResult<AssumptionAnalysis>(F); 1510 }; 1511 1512 std::function<BlockFrequencyInfo &(Function &)> GetBFI = 1513 [&FAM](Function &F) -> BlockFrequencyInfo & { 1514 return FAM.getResult<BlockFrequencyAnalysis>(F); 1515 }; 1516 1517 std::function<TargetTransformInfo &(Function &)> GetTTI = 1518 [&FAM](Function &F) -> TargetTransformInfo & { 1519 return FAM.getResult<TargetIRAnalysis>(F); 1520 }; 1521 1522 std::function<OptimizationRemarkEmitter &(Function &)> GetORE = 1523 [&FAM](Function &F) -> OptimizationRemarkEmitter & { 1524 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); 1525 }; 1526 1527 ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); 1528 1529 if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE) 1530 .run(M)) 1531 return PreservedAnalyses::none(); 1532 return PreservedAnalyses::all(); 1533 } 1534