//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));

// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));

// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

// Describes a single-region outlining candidate: the chain of guarding
// entry blocks that stay inlined, the early-return block, and the
// (to-be-outlined) region dominated by NonReturnBlock.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

// Describes the set of cold single-entry/single-exit regions to be
// outlined from one function.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo()
      : ORI() {}

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);

  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();

    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called
    // from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // Returns true when the per-module budget of partial inlinings
  // (-max-partial-inlining) has been exhausted; -1 means unlimited.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  // NOTE(review): assumes F has at least one user; user_begin() on a
  // user-less function would be UB here — callers must guarantee a use.
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    // The legacy pass manager has no on-demand BFI, hence NoneType::None;
    // computeOutliningColdRegionsInfo builds a scoped BFI itself in that case.
    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
        .run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
                                                    OptimizationRemarkEmitter &ORE) {
  BasicBlock *EntryBlock = &F->front();

  DominatorTree DT(*F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(*GetBFI)(*F);

  // Return if we don't have profiling information.
  if (!PSI->hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // A candidate region must have more than one block and its dominating
  // (front) block must have exactly one predecessor.
  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
    BasicBlock *Dom = BlockList.front();
    return BlockList.size() > 1 &&
           std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
  };

  // Returns the unique block with an edge leaving the region, or nullptr
  // (with a missed-optimization remark) when more than one exit edge exists.
  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
        if (!is_contained(BlockList, *SI)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &SI->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          } else
            ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB)
               ? BFI->getBlockProfileCount(BB).getValue()
               : 0;
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // the outlining the candidate region.
  int OverallFunctionCost = 0;
  for (auto &BB : *F)
    OverallFunctionCost += computeBBInlineCost(&BB);

#ifndef NDEBUG
  if (TracePartialInlining)
    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
#endif
  int MinOutlineRegionCost =
      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  DenseMap<BasicBlock *, bool> VisitedMap;
  DFS.push_back(CurrEntry);
  VisitedMap[CurrEntry] = true;
  // Use Depth First Search on the basic blocks to find CFG edges that are
  // considered cold.
  // Cold regions considered must also have its inline cost compared to the
  // overall inline cost of the original function. The region is outlined only
  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *thisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI->isColdBB(thisBB, BFI) ||
        BBProfileCount(thisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
      if (VisitedMap[*SI])
        continue;
      VisitedMap[*SI] = true;
      DFS.push_back(*SI);
      // If branch isn't cold, we skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
               << "\n";
      }
#endif
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      // We can only outline single entry regions (for now).
      if (!IsSingleEntry(DominateVector))
        continue;
      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector)))
        continue;
      int OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB);

#ifndef NDEBUG
      if (TracePartialInlining)
        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
#endif

      if (OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });
        continue;
      }
      // For now, ignore blocks that belong to a SISE region that is a
      // candidate for outlining. In the future, we may want to look
      // at inner regions because the outer region may have live-exit
      // variables.
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call
      // NOTE(review): getSingleSuccessor() may return null; the assert
      // below was deliberately left disabled — confirm downstream code
      // tolerates a null ReturnBlock.
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      // NOTE(review): redundant — Region was already initialized by the
      // constructor argument above.
      RegInfo.Region = DominateVector;
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }
  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

// Recognize the early-return ("if at the top") outlining shape: walk a chain
// of two-successor guard blocks from the entry until one branches to a return
// block; everything dominated by the other successor becomes the outline
// candidate. Returns null when the function does not match the shape.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair as (return block, non-return block), or (null, null)
  // when neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->hasProfileData())
    return true;
// Now check if any of the entry blocks has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Under -skip-partial-inlining-cost-analysis (testing), only check that
  // inlining is structurally possible.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
// Simplified per-basic-block inline cost: instructions that typically lower
// to no code (debug intrinsics, pointer casts, allocas, all-zero-index GEPs,
// lifetime markers) are free; calls/invokes are priced via getCallsiteCost;
// a switch is priced per successor; everything else costs InstrCost.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

// Returns a pair of (size cost of the call sequence(s) to the outlined
// function(s), estimated runtime overhead of outlining). The overhead is the
// call-sequence cost plus any growth of the extracted code relative to the
// original region, plus a fixed user-tunable penalty.
std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock* OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
// Users of DuplicateFunction are visited in order, so the BFI of a caller is
// computed once and reused while consecutive call sites share that caller.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  // Snapshot the users first: the map fill below must not race with use-list
  // iteration.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      // Build BFI from scratch (DT -> LI -> BPI -> BFI) since no analysis
      // getter is available.
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      // No profile data for this block: treat the call site as never taken.
      CallSiteToProfCountMap[User] = 0;
  }
}

// Single-region cloner: clones F and remaps the single-region outlining info
// (return block, non-return block, entries, return-block preds) through the
// clone's value map.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Multi-region cloner: clones F and remaps every outline-candidate region's
// blocks (region set, entry, exit, optional return block) through the clone's
// value map.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region) {
      Region.push_back(cast<BasicBlock>(VMap[BB]));
    }
    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Split the cloned return block so that PHIs with inputs from more than one
// extracted block are rewritten into a two-level PHI sequence: one level that
// stays inside the extracted region and one level outside it.
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Returns BB's leading instruction if it is a PHI, otherwise nullptr.
  // NOTE(review): every path through the loop breaks on the first iteration
  // (I is never advanced), so only the first instruction is ever inspected —
  // confirm this single-check behavior is intended.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // Nothing to split when each PHI has at most one incoming value beyond the
  // entry-region predecessors.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // A PHI is trivial if all its incoming values are the same value.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split at the first non-PHI: PreReturn keeps the old PHIs, the new
  // ReturnBlock receives the second-level PHIs built below.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Second-level PHI: merges the old PHI's remaining value (via PreReturn)
    // with the values that used to flow in from the entry-region preds.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Entry-region predecessors now branch straight to the new return block.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

// Extract every cold outline-candidate region of the clone into its own
// function via CodeExtractor. Returns true if at least one region was
// successfully outlined.
bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
    int Cost = 0;
    for (BasicBlock* BB : Region)
      Cost += computeBBInlineCost(BB);
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // NOTE(review): these sets are declared outside the loop and never cleared,
  // so findInputsOutputs accumulates values across regions; a region after
  // one with live outputs may be skipped spuriously — confirm intent.
  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

#ifndef NDEBUG
    if (TracePartialInlining) {
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    }
#endif
    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    Function *OutlinedFunc = CE.extractCodeRegion();

    if (OutlinedFunc) {
      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS.setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

// Extract everything in the clone except the entry blocks and the return
// block (i.e. the "body of the if") into a single outlined function. Returns
// the outlined function, or nullptr if extraction failed.
Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
            ClonedOI->Entries.end());
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // Extract the body of the if.
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI,
                    /* AllowVarargs */ true)
          .extractCodeRegion();

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            .getInstruction()
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

// Attempt partial inlining of F. Returns {changed, new outlined function to
// reconsider}. Cold multi-region outlining (profile-guided) is tried first;
// on failure it falls back to single-region (early-return) outlining.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Taking the address means uses cannot all be rewritten to the clone.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  // Partial inlining of a cold function is unlikely to pay off.
  if (PSI->isFunctionEntryCold(F))
    return {false, nullptr};

  // No call sites, nothing to do.
  if (F->user_begin() == F->user_end())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI->hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}

// Inline the (partially-inlinable) clone into every profitable call site,
// updating the original function's entry count from the per-callsite profile
// counts. Returns true if at least one call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  // Snapshot the users before inlining mutates the use list.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CS.getCaller());
    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}

// Driver: attempt to partially inline every defined, used, non-recursive
// function in the module. Newly outlined functions are pushed back on the
// worklist so they can themselves be considered.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    // Skip direct recursion: inlining the clone into itself is not handled.
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    if (Result.first) {
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

// New pass manager entry point: wire the function-level analysis getters
// (assumption cache, BFI, TTI) plus the module-level profile summary into
// PartialInlinerImpl.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}