//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// i.e. if the cold region reduces the inlining cost of the original function
// by at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. i.e. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. i.e.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

// Describes a single-region outlining candidate: the "entry" blocks that stay
// inlined guard the single region (rooted at NonReturnBlock) that is outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

// Describes the set of cold single-entry/single-exit regions selected for
// multi-region outlining.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo()
      : ORI() {}

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    // Block following the region's exit; may be null when the exit block has
    // no single successor (see computeOutliningColdRegionsInfo).
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in
  // the outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // True once the per-module partial-inlining budget (if any) is exhausted.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Wrap a user of a function in a CallSite; aborts if U is not a call or
  // invoke instruction.
  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  // Returns the call site of F's first user. Precondition (unchecked): F has
  // at least one user.
  // NOTE(review): the local 'User' shadows the llvm::User type name.
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    // The legacy pass manager has no BFI provider, hence NoneType::None.
    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
        .run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
                                                    OptimizationRemarkEmitter &ORE) {
  BasicBlock *EntryBlock = &F->front();

  DominatorTree DT(*F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    // Old pass manager: build a local BFI from scratch.
    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(*GetBFI)(*F);

  // Return if we don't have profiling information.
  if (!PSI->hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // A candidate region must have more than one block and its dominating
  // block must have exactly one predecessor (the cold edge's source).
  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
    BasicBlock *Dom = BlockList.front();
    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
  };

  // Returns the unique block inside the region with an edge leaving the
  // region, or null (with a missed-optimization remark) if there are several.
  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
        if (!is_contained(BlockList, *SI)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &SI->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          } else
            ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  // Treat a missing profile count as zero.
  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB)
               ? BFI->getBlockProfileCount(BB).getValue()
               : 0;
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // the outlining the candidate region.
  int OverallFunctionCost = 0;
  for (auto &BB : *F)
    OverallFunctionCost += computeBBInlineCost(&BB);

#ifndef NDEBUG
  if (TracePartialInlining)
    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
#endif
  int MinOutlineRegionCost =
      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  DenseMap<BasicBlock *, bool> VisitedMap;
  DFS.push_back(CurrEntry);
  VisitedMap[CurrEntry] = true;
  // Use Depth First Search on the basic blocks to find CFG edges that are
  // considered cold.
  // Cold regions considered must also have its inline cost compared to the
  // overall inline cost of the original function. The region is outlined only
  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *thisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI->isColdBlock(thisBB, BFI) ||
        BBProfileCount(thisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
      if (VisitedMap[*SI])
        continue;
      VisitedMap[*SI] = true;
      DFS.push_back(*SI);
      // If branch isn't cold, we skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
               << "\n";
      }
#endif
      // Candidate region = everything dominated by the cold edge's target.
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      // We can only outline single entry regions (for now).
      if (!IsSingleEntry(DominateVector))
        continue;
      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector)))
        continue;
      int OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB);

#ifndef NDEBUG
      if (TracePartialInlining)
        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
#endif

      if (OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });
        continue;
      }
      // For now, ignore blocks that belong to a SISE region that is a
      // candidate for outlining. In the future, we may want to look
      // at inner regions because the outer region may have live-exit
      // variables.
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call
      // NOTE(review): getSingleSuccessor() may return null; the disabled
      // assert below suggests this case is unresolved — RegInfo.ReturnBlock
      // can be null downstream.
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      // NOTE(review): redundant — Region was already initialized to
      // DominateVector by the constructor above.
      RegInfo.Region = DominateVector;
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }
  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair as (return block, other block), or (null, null) when
  // neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return
          std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  // Walk the chain of guarding entry blocks: each must either branch to the
  // return block (candidate found) or to a triangle's common successor.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the candidate shape and records which entries branch directly
  // to the return block.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->hasProfileData())
    return
        true;
  // Now check if any of the entry block has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    // NOTE(review): 'F' here shadows the Function *F parameter.
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  // NOTE(review): ClonedOI is only set on the single-region path; confirm
  // hasProfileData tolerates the multi-region case before relying on it here.
  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  if (SkipCostAnalysis)
    return isInlineViable(*Callee);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) <<
")"; 794 }); 795 return false; 796 } 797 const DataLayout &DL = Caller->getParent()->getDataLayout(); 798 799 // The savings of eliminating the call: 800 int NonWeightedSavings = getCallsiteCost(CS, DL); 801 BlockFrequency NormWeightedSavings(NonWeightedSavings); 802 803 // Weighted saving is smaller than weighted cost, return false 804 if (NormWeightedSavings < WeightedOutliningRcost) { 805 ORE.emit([&]() { 806 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 807 Call) 808 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 809 << NV("Caller", Caller) << " runtime overhead (overhead=" 810 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 811 << ", savings=" 812 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 813 << ")" 814 << " of making the outlined call is too high"; 815 }); 816 817 return false; 818 } 819 820 ORE.emit([&]() { 821 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) 822 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 823 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 824 << " (threshold=" 825 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 826 }); 827 return true; 828 } 829 830 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 831 // For now use a simplified version. The returned 'InlineCost' will be used 832 // to esimate the size cost as well as runtime cost of the BB. 833 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) { 834 int InlineCost = 0; 835 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 836 for (Instruction &I : BB->instructionsWithoutDebug()) { 837 // Skip free instructions. 
    switch (I.getOpcode()) {
    // These typically lower to no machine code, so count them as free.
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      // An all-zero-index GEP is a pure address reinterpretation; free.
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    // Lifetime markers generate no code.
    if (I.isLifetimeStartOrEnd())
      continue;

    // Calls and invokes are priced with the shared call-site cost helper.
    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    // A switch is charged one instruction per case plus the default.
    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    // Everything else costs one generic instruction.
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

// Compute, over all regions extracted by \p Cloner:
//   (1) the size cost of the call sequences to the outlined functions, and
//   (2) the runtime overhead of outlining: the call sequences plus any code
//       the extractor added beyond the original regions, plus a fixed
//       penalty (ExtraOutliningPenalty).
std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock* OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout.
  // The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  // Snapshot the users so the use list can be walked stably.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  // Obtain BFI for the current caller, computing it locally when no
  // pass-manager-provided getter is available.
  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    // Recompute BFI only when the caller changes between consecutive users.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    // Call sites without a profile count are recorded as zero.
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

// Single-region cloner: clone \p F so it can be freely transformed, and
// remap the outlining info \p OI onto the cloned blocks.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI,
    OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Remap the outlining info's blocks to their clones through VMap.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Multi-region cloner: clone \p F and remap every outline-candidate region
// recorded in \p OI onto the cloned blocks.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region) {
      Region.push_back(cast<BasicBlock>(VMap[BB]));
    }
    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    // ReturnBlock is optional in a region; remap it only when present.
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Split the cloned function's return block so PHI inputs that come from the
// region to be extracted are separated from those that stay behind.
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Return the leading PHI of \p BB, or null if BB does not start with one.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // No PHI, or every extra input already comes from the entry preds plus one
  // more edge: nothing to normalize.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // Return the unique incoming value if all of \p PN's inputs are identical,
  // otherwise null.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split right after the PHIs: the new block becomes the return block that
  // stays outside the extracted region; PreReturn keeps the original PHIs.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Second-level PHI in the new return block: merges the value flowing out
    // of the extracted region (via PreReturn) with the entry-pred values.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    // Move the entry-pred inputs from the old (first-level) PHI onto the new
    // second-level PHI.
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
1071 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1072 OldPhi->replaceAllUsesWith(OldPhiVal); 1073 DeadPhis.push_back(OldPhi); 1074 } 1075 ++I; 1076 } 1077 for (auto *DP : DeadPhis) 1078 DP->eraseFromParent(); 1079 1080 for (auto E : ClonedOI->ReturnBlockPreds) { 1081 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1082 } 1083 } 1084 1085 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1086 1087 auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) { 1088 int Cost = 0; 1089 for (BasicBlock* BB : Region) 1090 Cost += computeBBInlineCost(BB); 1091 return Cost; 1092 }; 1093 1094 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1095 1096 if (ClonedOMRI->ORI.empty()) 1097 return false; 1098 1099 // The CodeExtractor needs a dominator tree. 1100 DominatorTree DT; 1101 DT.recalculate(*ClonedFunc); 1102 1103 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1104 LoopInfo LI(DT); 1105 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1106 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1107 1108 SetVector<Value *> Inputs, Outputs, Sinks; 1109 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1110 ClonedOMRI->ORI) { 1111 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); 1112 1113 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1114 ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false); 1115 1116 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1117 1118 #ifndef NDEBUG 1119 if (TracePartialInlining) { 1120 dbgs() << "inputs: " << Inputs.size() << "\n"; 1121 dbgs() << "outputs: " << Outputs.size() << "\n"; 1122 for (Value *value : Inputs) 1123 dbgs() << "value used in func: " << *value << "\n"; 1124 for (Value *output : Outputs) 1125 dbgs() << "instr used in func: " << *output << "\n"; 1126 } 1127 #endif 1128 // Do not extract regions that have live exit variables. 
    // Do not extract regions with live exits unless forced: passing the
    // live-out values back would add load/store/arg-passing overhead.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    Function *OutlinedFunc = CE.extractCodeRegion();

    if (OutlinedFunc) {
      // The extractor leaves exactly one call to the new function; record
      // the block containing it for later cost computation.
      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      // Optionally mark the cold outlined function (and its call) with the
      // cold calling convention.
      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS.setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

// Extract everything except the entry blocks and the return block into a
// single outlined function. Returns the outlined function, or null on
// extraction failure.
Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
            ClonedOI->Entries.end());
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  // Every block that is neither an entry nor the return block goes into the
  // extracted region.
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // Extract the body of the if.
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI,
                    /* AllowVarargs */ true)
          .extractCodeRegion();

  if (OutlinedFunc) {
    // Record the block containing the single call to the outlined function.
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            .getInstruction()
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

// Attempt partial inlining of \p F. Returns {true, nullptr} if cold-region
// (multi-region) partial inlining succeeded, {Inlined, OutlinedFn} for the
// single-region path (the caller re-queues OutlinedFn), and {false, nullptr}
// when F is not a candidate or nothing was inlined.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Address-taken functions may be called indirectly; the clone-and-RAUW
  // scheme used below does not handle that.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI->isFunctionEntryCold(F))
    return {false, nullptr};

  // Nothing to inline into.
  if (empty(F->users()))
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI->hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE);
  // Split return-block PHIs so the region can be cleanly extracted.
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  // Hand the new outlined function back so the driver can consider it for
  // further partial inlining.
  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}

// Inline the reduced clone (with its region(s) outlined) into each caller
// that passes the cost model. Returns true if at least one call site was
// inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  // Bail out when outlining made the function look bigger, not smaller.
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // FunctionCloner's constructor redirected all of OrigFunc's uses to the
  // clone, so OrigFunc must have none left.
  assert(empty(Cloner.OrigFunc->users()) &&
         "F's users should all be replaced!");

  // Snapshot the users: InlineFunction below mutates the use list.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CS.getCaller());
    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      // Saturating subtraction: the entry count never goes negative.
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    // NOTE(review): NumPartialInlining (distinct from the NumPartialInlined
    // statistic) is declared earlier in this file and appears to drive the
    // IsLimitReached() cap — confirm against the full file.
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    // Fold the inlined call sites' counts out of the original entry count.
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}

// Driver: attempt partial inlining on every defined, used function in \p M,
// re-queuing newly created outlined functions for consideration.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    // Skip directly recursive functions: the clone-and-replace scheme would
    // rewrite the recursive call to point at the clone.
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
    // A newly outlined function may itself be partially inlinable.
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

// New pass manager entry point: wire the analysis getters to the function
// analysis manager and run the shared implementation.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}