//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));

// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));

// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Cap on how many blocks may be kept (inlined) at the call site in the
// single-region case; see FunctionOutliningInfo::GetNumInlinedBlocks().
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

// Describes the shape of a single-region (early-return) outlining candidate:
// a chain of guarding entry blocks that stay in the caller, plus the return
// block and the dominating block of the region to be outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

// Describes the shape of the multi-region (cold-region) outlining case:
// a list of single-entry/single-exit regions found to be cold.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() : ORI() {}

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region, BasicBlock *EntryBlock,
                      BasicBlock *ExitBlock, BasicBlock *ReturnBlock)
        : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
          ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      std::function<const TargetLibraryInfo &(Function &)> *GTLI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  bool run(Module &M);

  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC);

    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();

    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  std::function<const TargetLibraryInfo &(Function &)> *GetTLI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // True once the module-wide budget (-max-partial-inlining) is exhausted.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Wrap a user of a function in a CallSite; the user must be a call or
  // invoke instruction.
  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  // Return the call site of F's first user. F is required to have at least
  // one user when this is called.
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  // Return the debug location and enclosing block of one call site to F,
  // used for emitting optimization remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
      return ACT->lookupAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    std::function<const TargetLibraryInfo &(Function &)> GetTLI =
        [this](Function &F) -> TargetLibraryInfo & {
      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
                              &GetTTI, NoneType::None, &GetTLI, PSI)
        .run(M);
  }
};

} // end anonymous namespace
// Find cold single-entry/single-exit regions that are profitable to outline.
// Returns null if the function has no instrumentation profile or no candidate
// region passes the size/probability thresholds.
std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
                                                    OptimizationRemarkEmitter &ORE) {
  BasicBlock *EntryBlock = &F->front();

  DominatorTree DT(*F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(*GetBFI)(*F);

  // Return if we don't have profiling information.
  if (!PSI->hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      std::make_unique<FunctionOutliningMultiRegionInfo>();

  // A candidate region (given as a dominator-tree descendant list headed by
  // its entry) is single-entry if its head has exactly one predecessor.
  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
    BasicBlock *Dom = BlockList.front();
    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
  };

  // Returns the unique block with an exit edge leaving the region, or null
  // (emitting a missed-optimization remark) if more than one such edge exists.
  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
        if (!is_contained(BlockList, *SI)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &SI->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          } else
            ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB)
               ? BFI->getBlockProfileCount(BB).getValue()
               : 0;
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // the outlining the candidate region.
  int OverallFunctionCost = 0;
  for (auto &BB : *F)
    OverallFunctionCost += computeBBInlineCost(&BB);

#ifndef NDEBUG
  if (TracePartialInlining)
    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
#endif
  int MinOutlineRegionCost =
      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  DenseMap<BasicBlock *, bool> VisitedMap;
  DFS.push_back(CurrEntry);
  VisitedMap[CurrEntry] = true;
  // Use Depth First Search on the basic blocks to find CFG edges that are
  // considered cold.
  // Cold regions considered must also have its inline cost compared to the
  // overall inline cost of the original function. The region is outlined only
  // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
  // more.
  while (!DFS.empty()) {
    auto *thisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI->isColdBlock(thisBB, BFI) ||
        BBProfileCount(thisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
      if (VisitedMap[*SI])
        continue;
      VisitedMap[*SI] = true;
      DFS.push_back(*SI);
      // If branch isn't cold, we skip to the next one.
      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
               << "\n";
      }
#endif
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      // We can only outline single entry regions (for now).
      if (!IsSingleEntry(DominateVector))
        continue;
      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector)))
        continue;
      int OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB);

#ifndef NDEBUG
      if (TracePartialInlining)
        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
#endif

      if (OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });
        continue;
      }
      // For now, ignore blocks that belong to a SISE region that is a
      // candidate for outlining. In the future, we may want to look
      // at inner regions because the outer region may have live-exit
      // variables.
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }
  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

// Detect the single-region (early-return) outlining shape: a chain of
// two-successor entry blocks guarding a region, with one branch leading to
// a return block. Returns null if no candidate is found.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders a successor pair as (return block, other); (null, null) when
  // neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a Predecessor which is not
  // in Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->hasProfileData())
    return true;
  // Now check if any of the entry block has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  assert(Call && "invalid callsite for partial inline");
  InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
                                CalleeTTI, *GetAssumptionCache, GetBFI, *GetTLI,
                                PSI, RemarksEnabled ? &ORE : nullptr);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    if (I.isLifetimeStartOrEnd())
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(*CI, DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(*II, DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock* OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  // Snapshot the users up front; the loop below may be iterated while the
  // use-list would otherwise be unstable.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      // No BFI getter available: build DT/LI/BPI/BFI by hand for this caller.
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    // BFI is cached per caller; recompute only when the caller changes.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    // Default to zero when no profile count is available for the block.
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

/// Single-region cloner: clone \p F, remap the single-region outlining info
/// (return block, non-return block, entries, return-block preds) into the
/// clone, and redirect all users of \p F to the clone.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
  ClonedOI = std::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Translate every block recorded in the outlining info to its clone.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

/// Multi-region cloner: clone \p F, remap every outline-candidate region
/// (blocks, entry/exit/return blocks) into the clone, and redirect all users
/// of \p F to the clone.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region) {
      Region.push_back(cast<BasicBlock>(VMap[BB]));
    }
    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    // ReturnBlock is optional for a region; remap only when present.
    BasicBlock *NewReturnBlock = nullptr;
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

/// Split the cloned return block's PHI nodes so that incoming values from the
/// to-be-extracted region and from the entry blocks end up in separate
/// (two-level) PHIs, keeping the extracted region self-contained.
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Returns the block's leading PHI, or nullptr if the block has none.
  // (The loop body always breaks, so it inspects only the first instruction.)
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // If every incoming value of a PHI is identical, return that value.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split after the PHIs: PreReturn keeps the old PHIs, ReturnBlock gets new
  // merge PHIs built below.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // New PHI merges the old PHI's value (via PreReturn) with the values
    // coming directly from the entry-side predecessors.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Retarget entry-side predecessors at the new merge block.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

/// Extract every candidate cold region of the clone into its own function
/// via CodeExtractor. Returns true if at least one region was outlined.
/// Regions with live-out values are skipped (unless ForceLiveExit).
bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  // Sum of the simplified inline costs of a region's blocks.
  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
    int Cost = 0;
    for (BasicBlock* BB : Region)
      Cost += computeBBInlineCost(BB);
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);

  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI,
                     LookupAC(*RegionInfo.EntryBlock->getParent()),
                     /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

#ifndef NDEBUG
    if (TracePartialInlining) {
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    }
#endif
    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);

    if (OutlinedFunc) {
      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      // Optionally mark the outlined cold function with the cold CC.
      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS.setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

/// Extract everything except the entry blocks and the return block of the
/// clone into a single outlined function. Returns the outlined function, or
/// nullptr if extraction failed.
Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
            ClonedOI->Entries.end());
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // Extract the body of the if.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion(CEAC);

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            .getInstruction()
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

/// Top-level driver for one function: try multi-region (cold-region) partial
/// inlining first when profile data is available, then fall back to
/// single-region partial inlining.
///
/// Returns {changed, outlined-function}; the outlined function (non-null only
/// for the single-region path) is pushed back onto the caller's worklist.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Address-taken functions cannot have all their uses rewritten to a clone.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI->isFunctionEntryCold(F))
    return {false, nullptr};

  if (F->users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI->hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}

/// Inline the partially-inlinable clone into every profitable call site.
/// Performs cost analysis, inlines via InlineFunction, updates profile entry
/// counts, and emits remarks. Returns true if any call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum
  // of the original outlined region size(s), it does not increase the chances
  // of inlining the function with outlining (The inliner uses the size
  // increase to model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CS.getCaller());
    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}

/// Module driver: worklist of defined, used functions; skips recursive
/// functions; newly-outlined functions are re-queued so they can themselves
/// be considered for partial inlining.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    // Uses may have disappeared since the worklist was built.
    if (CurrFunc->use_empty())
      continue;

    // Skip (directly) recursive functions: a self-call inside the function
    // would be rewritten to the clone and break the RAUW dance.
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    std::pair<bool, Function *> Result = unswitchFunction(CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

/// New pass manager entry point: wire the function-level analysis getters
/// into PartialInlinerImpl and run it over the module.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  // Lookup (cached-only) variant used where computing a fresh cache is not
  // allowed.
  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  std::function<const TargetLibraryInfo &(Function &)> GetTLI =
      [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
                         {GetBFI}, &GetTLI, PSI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}