1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/BranchProbabilityInfo.h" 24 #include "llvm/Analysis/InlineCost.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 27 #include "llvm/Analysis/ProfileSummaryInfo.h" 28 #include "llvm/Analysis/TargetLibraryInfo.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/IR/Attributes.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/CFG.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Dominators.h" 36 #include "llvm/IR/Function.h" 37 #include "llvm/IR/InstrTypes.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/IntrinsicInst.h" 41 #include "llvm/IR/Intrinsics.h" 42 #include "llvm/IR/Module.h" 43 #include "llvm/IR/User.h" 44 #include "llvm/InitializePasses.h" 45 #include "llvm/Pass.h" 46 #include "llvm/Support/BlockFrequency.h" 47 #include "llvm/Support/BranchProbability.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CommandLine.h" 50 
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));

// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));

// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
131 static cl::opt<int> MaxNumPartialInlining( 132 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 133 cl::desc("Max number of partial inlining. The default is unlimited")); 134 135 // Used only when PGO or user annotated branch data is absent. It is 136 // the least value that is used to weigh the outline region. If BFI 137 // produces larger value, the BFI value will be used. 138 static cl::opt<int> 139 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 140 cl::Hidden, cl::ZeroOrMore, 141 cl::desc("Relative frequency of outline region to " 142 "the entry block")); 143 144 static cl::opt<unsigned> ExtraOutliningPenalty( 145 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 146 cl::desc("A debug option to add additional penalty to the computed one.")); 147 148 namespace { 149 150 struct FunctionOutliningInfo { 151 FunctionOutliningInfo() = default; 152 153 // Returns the number of blocks to be inlined including all blocks 154 // in Entries and one return block. 155 unsigned getNumInlinedBlocks() const { return Entries.size() + 1; } 156 157 // A set of blocks including the function entry that guard 158 // the region to be outlined. 159 SmallVector<BasicBlock *, 4> Entries; 160 161 // The return block that is not included in the outlined region. 162 BasicBlock *ReturnBlock = nullptr; 163 164 // The dominating block of the region to be outlined. 
165 BasicBlock *NonReturnBlock = nullptr; 166 167 // The set of blocks in Entries that that are predecessors to ReturnBlock 168 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 169 }; 170 171 struct FunctionOutliningMultiRegionInfo { 172 FunctionOutliningMultiRegionInfo() 173 : ORI() {} 174 175 // Container for outline regions 176 struct OutlineRegionInfo { 177 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 178 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 179 BasicBlock *ReturnBlock) 180 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 181 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 182 SmallVector<BasicBlock *, 8> Region; 183 BasicBlock *EntryBlock; 184 BasicBlock *ExitBlock; 185 BasicBlock *ReturnBlock; 186 }; 187 188 SmallVector<OutlineRegionInfo, 4> ORI; 189 }; 190 191 struct PartialInlinerImpl { 192 193 PartialInlinerImpl( 194 function_ref<AssumptionCache &(Function &)> GetAC, 195 function_ref<AssumptionCache *(Function &)> LookupAC, 196 function_ref<TargetTransformInfo &(Function &)> GTTI, 197 function_ref<const TargetLibraryInfo &(Function &)> GTLI, 198 ProfileSummaryInfo &ProfSI, 199 function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr) 200 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 201 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {} 202 203 bool run(Module &M); 204 // Main part of the transformation that calls helper functions to find 205 // outlining candidates, clone & outline the function, and attempt to 206 // partially inline the resulting function. Returns true if 207 // inlining was successful, false otherwise. Also returns the outline 208 // function (only if we partially inlined early returns) as there is a 209 // possibility to further "peel" early return statements that were left in the 210 // outline function due to code size. 211 std::pair<bool, Function *> unswitchFunction(Function &F); 212 213 // This class speculatively clones the function to be partial inlined. 
214 // At the end of partial inlining, the remaining callsites to the cloned 215 // function that are not partially inlined will be fixed up to reference 216 // the original function, and the cloned function will be erased. 217 struct FunctionCloner { 218 // Two constructors, one for single region outlining, the other for 219 // multi-region outlining. 220 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 221 OptimizationRemarkEmitter &ORE, 222 function_ref<AssumptionCache *(Function &)> LookupAC, 223 function_ref<TargetTransformInfo &(Function &)> GetTTI); 224 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 225 OptimizationRemarkEmitter &ORE, 226 function_ref<AssumptionCache *(Function &)> LookupAC, 227 function_ref<TargetTransformInfo &(Function &)> GetTTI); 228 229 ~FunctionCloner(); 230 231 // Prepare for function outlining: making sure there is only 232 // one incoming edge from the extracted/outlined region to 233 // the return block. 234 void normalizeReturnBlock() const; 235 236 // Do function outlining for cold regions. 237 bool doMultiRegionFunctionOutlining(); 238 // Do function outlining for region after early return block(s). 239 // NOTE: For vararg functions that do the vararg handling in the outlined 240 // function, we temporarily generate IR that does not properly 241 // forward varargs to the outlined function. Calling InlineFunction 242 // will update calls to the outlined functions to properly forward 243 // the varargs. 244 Function *doSingleRegionFunctionOutlining(); 245 246 Function *OrigFunc = nullptr; 247 Function *ClonedFunc = nullptr; 248 249 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 250 // Keep track of Outlined Functions and the basic block they're called from. 251 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 252 253 // ClonedFunc is inlined in one of its callers after function 254 // outlining. 255 bool IsFunctionInlined = false; 256 // The cost of the region to be outlined. 
257 int OutlinedRegionCost = 0; 258 // ClonedOI is specific to outlining non-early return blocks. 259 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 260 // ClonedOMRI is specific to outlining cold regions. 261 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 262 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 263 OptimizationRemarkEmitter &ORE; 264 function_ref<AssumptionCache *(Function &)> LookupAC; 265 function_ref<TargetTransformInfo &(Function &)> GetTTI; 266 }; 267 268 private: 269 int NumPartialInlining = 0; 270 function_ref<AssumptionCache &(Function &)> GetAssumptionCache; 271 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 272 function_ref<TargetTransformInfo &(Function &)> GetTTI; 273 function_ref<BlockFrequencyInfo &(Function &)> GetBFI; 274 function_ref<const TargetLibraryInfo &(Function &)> GetTLI; 275 ProfileSummaryInfo &PSI; 276 277 // Return the frequency of the OutlininingBB relative to F's entry point. 278 // The result is no larger than 1 and is represented using BP. 279 // (Note that the outlined region's 'head' block can only have incoming 280 // edges from the guarding entry blocks). 281 BranchProbability 282 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const; 283 284 // Return true if the callee of CB should be partially inlined with 285 // profit. 286 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner, 287 BlockFrequency WeightedOutliningRcost, 288 OptimizationRemarkEmitter &ORE) const; 289 290 // Try to inline DuplicateFunction (cloned from F with call to 291 // the OutlinedFunction into its callers. Return true 292 // if there is any successful inlining. 293 bool tryPartialInline(FunctionCloner &Cloner); 294 295 // Compute the mapping from use site of DuplicationFunction to the enclosing 296 // BB's profile count. 
297 void 298 computeCallsiteToProfCountMap(Function *DuplicateFunction, 299 DenseMap<User *, uint64_t> &SiteCountMap) const; 300 301 bool isLimitReached() const { 302 return (MaxNumPartialInlining != -1 && 303 NumPartialInlining >= MaxNumPartialInlining); 304 } 305 306 static CallBase *getSupportedCallBase(User *U) { 307 if (isa<CallInst>(U) || isa<InvokeInst>(U)) 308 return cast<CallBase>(U); 309 llvm_unreachable("All uses must be calls"); 310 return nullptr; 311 } 312 313 static CallBase *getOneCallSiteTo(Function &F) { 314 User *User = *F.user_begin(); 315 return getSupportedCallBase(User); 316 } 317 318 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const { 319 CallBase *CB = getOneCallSiteTo(F); 320 DebugLoc DLoc = CB->getDebugLoc(); 321 BasicBlock *Block = CB->getParent(); 322 return std::make_tuple(DLoc, Block); 323 } 324 325 // Returns the costs associated with function outlining: 326 // - The first value is the non-weighted runtime cost for making the call 327 // to the outlined function, including the addtional setup cost in the 328 // outlined function itself; 329 // - The second value is the estimated size of the new call sequence in 330 // basic block Cloner.OutliningCallBB; 331 std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner) const; 332 333 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 334 // approximate both the size and runtime cost (Note that in the current 335 // inline cost analysis, there is no clear distinction there either). 
336 static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI); 337 338 std::unique_ptr<FunctionOutliningInfo> 339 computeOutliningInfo(Function &F) const; 340 341 std::unique_ptr<FunctionOutliningMultiRegionInfo> 342 computeOutliningColdRegionsInfo(Function &F, 343 OptimizationRemarkEmitter &ORE) const; 344 }; 345 346 struct PartialInlinerLegacyPass : public ModulePass { 347 static char ID; // Pass identification, replacement for typeid 348 349 PartialInlinerLegacyPass() : ModulePass(ID) { 350 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 351 } 352 353 void getAnalysisUsage(AnalysisUsage &AU) const override { 354 AU.addRequired<AssumptionCacheTracker>(); 355 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 356 AU.addRequired<TargetTransformInfoWrapperPass>(); 357 AU.addRequired<TargetLibraryInfoWrapperPass>(); 358 } 359 360 bool runOnModule(Module &M) override { 361 if (skipModule(M)) 362 return false; 363 364 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 365 TargetTransformInfoWrapperPass *TTIWP = 366 &getAnalysis<TargetTransformInfoWrapperPass>(); 367 ProfileSummaryInfo &PSI = 368 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 369 370 auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { 371 return ACT->getAssumptionCache(F); 372 }; 373 374 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { 375 return ACT->lookupAssumptionCache(F); 376 }; 377 378 auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & { 379 return TTIWP->getTTI(F); 380 }; 381 382 auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { 383 return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 384 }; 385 386 return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 387 GetTLI, PSI) 388 .run(M); 389 } 390 }; 391 392 } // end anonymous namespace 393 394 std::unique_ptr<FunctionOutliningMultiRegionInfo> 395 
PartialInlinerImpl::computeOutliningColdRegionsInfo( 396 Function &F, OptimizationRemarkEmitter &ORE) const { 397 BasicBlock *EntryBlock = &F.front(); 398 399 DominatorTree DT(F); 400 LoopInfo LI(DT); 401 BranchProbabilityInfo BPI(F, LI); 402 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 403 BlockFrequencyInfo *BFI; 404 if (!GetBFI) { 405 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); 406 BFI = ScopedBFI.get(); 407 } else 408 BFI = &(GetBFI(F)); 409 410 // Return if we don't have profiling information. 411 if (!PSI.hasInstrumentationProfile()) 412 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 413 414 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 415 std::make_unique<FunctionOutliningMultiRegionInfo>(); 416 417 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { 418 BasicBlock *Dom = BlockList.front(); 419 return BlockList.size() > 1 && Dom->hasNPredecessors(1); 420 }; 421 422 auto IsSingleExit = 423 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 424 BasicBlock *ExitBlock = nullptr; 425 for (auto *Block : BlockList) { 426 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { 427 if (!is_contained(BlockList, *SI)) { 428 if (ExitBlock) { 429 ORE.emit([&]() { 430 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 431 &SI->front()) 432 << "Region dominated by " 433 << ore::NV("Block", BlockList.front()->getName()) 434 << " has more than one region exit edge."; 435 }); 436 return nullptr; 437 } 438 439 ExitBlock = Block; 440 } 441 } 442 } 443 return ExitBlock; 444 }; 445 446 auto BBProfileCount = [BFI](BasicBlock *BB) { 447 return BFI->getBlockProfileCount(BB) 448 ? BFI->getBlockProfileCount(BB).getValue() 449 : 0; 450 }; 451 452 // Use the same computeBBInlineCost function to compute the cost savings of 453 // the outlining the candidate region. 
454 TargetTransformInfo *FTTI = &GetTTI(F); 455 int OverallFunctionCost = 0; 456 for (auto &BB : F) 457 OverallFunctionCost += computeBBInlineCost(&BB, FTTI); 458 459 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost 460 << "\n";); 461 462 int MinOutlineRegionCost = 463 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); 464 BranchProbability MinBranchProbability( 465 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 466 MinBlockCounterExecution); 467 bool ColdCandidateFound = false; 468 BasicBlock *CurrEntry = EntryBlock; 469 std::vector<BasicBlock *> DFS; 470 DenseMap<BasicBlock *, bool> VisitedMap; 471 DFS.push_back(CurrEntry); 472 VisitedMap[CurrEntry] = true; 473 474 // Use Depth First Search on the basic blocks to find CFG edges that are 475 // considered cold. 476 // Cold regions considered must also have its inline cost compared to the 477 // overall inline cost of the original function. The region is outlined only 478 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 479 // more. 480 while (!DFS.empty()) { 481 auto *ThisBB = DFS.back(); 482 DFS.pop_back(); 483 // Only consider regions with predecessor blocks that are considered 484 // not-cold (default: part of the top 99.99% of all block counters) 485 // AND greater than our minimum block execution count (default: 100). 486 if (PSI.isColdBlock(ThisBB, BFI) || 487 BBProfileCount(ThisBB) < MinBlockCounterExecution) 488 continue; 489 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) { 490 if (VisitedMap[*SI]) 491 continue; 492 VisitedMap[*SI] = true; 493 DFS.push_back(*SI); 494 // If branch isn't cold, we skip to the next one. 
495 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI); 496 if (SuccProb > MinBranchProbability) 497 continue; 498 499 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->" 500 << SI->getName() 501 << "\nBranch Probability = " << SuccProb << "\n";); 502 503 SmallVector<BasicBlock *, 8> DominateVector; 504 DT.getDescendants(*SI, DominateVector); 505 506 // We can only outline single entry regions (for now). 507 if (!IsSingleEntry(DominateVector)) 508 continue; 509 510 BasicBlock *ExitBlock = nullptr; 511 // We can only outline single exit regions (for now). 512 if (!(ExitBlock = IsSingleExit(DominateVector))) 513 continue; 514 515 int OutlineRegionCost = 0; 516 for (auto *BB : DominateVector) 517 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 518 519 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost 520 << "\n";); 521 522 if (OutlineRegionCost < MinOutlineRegionCost) { 523 ORE.emit([&]() { 524 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 525 &SI->front()) 526 << ore::NV("Callee", &F) 527 << " inline cost-savings smaller than " 528 << ore::NV("Cost", MinOutlineRegionCost); 529 }); 530 continue; 531 } 532 // For now, ignore blocks that belong to a SISE region that is a 533 // candidate for outlining. In the future, we may want to look 534 // at inner regions because the outer region may have live-exit 535 // variables. 
536 for (auto *BB : DominateVector) 537 VisitedMap[BB] = true; 538 539 // ReturnBlock here means the block after the outline call 540 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 541 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 542 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 543 OutliningInfo->ORI.push_back(RegInfo); 544 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: " 545 << DominateVector.front()->getName() << "\n";); 546 ColdCandidateFound = true; 547 NumColdRegionsFound++; 548 } 549 } 550 551 if (ColdCandidateFound) 552 return OutliningInfo; 553 554 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 555 } 556 557 std::unique_ptr<FunctionOutliningInfo> 558 PartialInlinerImpl::computeOutliningInfo(Function &F) const { 559 BasicBlock *EntryBlock = &F.front(); 560 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 561 if (!BR || BR->isUnconditional()) 562 return std::unique_ptr<FunctionOutliningInfo>(); 563 564 // Returns true if Succ is BB's successor 565 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 566 return is_contained(successors(BB), Succ); 567 }; 568 569 auto IsReturnBlock = [](BasicBlock *BB) { 570 Instruction *TI = BB->getTerminator(); 571 return isa<ReturnInst>(TI); 572 }; 573 574 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 575 if (IsReturnBlock(Succ1)) 576 return std::make_tuple(Succ1, Succ2); 577 if (IsReturnBlock(Succ2)) 578 return std::make_tuple(Succ2, Succ1); 579 580 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 581 }; 582 583 // Detect a triangular shape: 584 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 585 if (IsSuccessor(Succ1, Succ2)) 586 return std::make_tuple(Succ1, Succ2); 587 if (IsSuccessor(Succ2, Succ1)) 588 return std::make_tuple(Succ2, Succ1); 589 590 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 591 }; 592 593 
  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    // One successor returns directly: this entry guards the region to
    // outline (the non-return side).
    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    // Otherwise look for a triangle; the common successor becomes the
    // next entry candidate to examine.
    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  // NOTE(review): Entries is captured by value, so blocks inserted into
  // Entries by the growing loop at the bottom of this function are NOT
  // visible to this predicate — presumably intentional (it limits how far
  // the region can grow); confirm before changing the capture.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
  if (F.hasProfileData())
    return true;
  // Now check if any of the entry block has MD_prof data:
  for (auto *E : OI.Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
    FunctionCloner &Cloner) const {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
    OutliningCallFreq = EntryFreq;

  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  // With real profile data the measured frequency is trusted as-is.
  if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Testing hook: bypass the cost model entirely.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
                                            TargetTransformInfo *TTI) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    if (I.isLifetimeStartOrEnd())
      continue;

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      Intrinsic::ID IID = II->getIntrinsicID();
      SmallVector<Type *, 4> Tys;
      FastMathFlags FMF;
      for (Value *Val : II->args())
        Tys.push_back(Val->getType());

      if (auto *FPMO = dyn_cast<FPMathOperator>(II))
        FMF = FPMO->getFastMathFlags();

      IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
      InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
      continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(*CI, DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(*II, DL);
      continue;
    }

    // Switches lower to roughly one compare+branch per case.
    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost +=
InlineConstants::InstrCost; 900 } 901 return InlineCost; 902 } 903 904 std::tuple<int, int> 905 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { 906 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 907 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 908 Function *OutlinedFunc = FuncBBPair.first; 909 BasicBlock* OutliningCallBB = FuncBBPair.second; 910 // Now compute the cost of the call sequence to the outlined function 911 // 'OutlinedFunction' in BB 'OutliningCallBB': 912 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 913 OutliningFuncCallCost += 914 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 915 916 // Now compute the cost of the extracted/outlined function itself: 917 for (BasicBlock &BB : *OutlinedFunc) 918 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 919 } 920 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 921 "Outlined function cost should be no less than the outlined region"); 922 923 // The code extractor introduces a new root and exit stub blocks with 924 // additional unconditional branches. Those branches will be eliminated 925 // later with bb layout. The cost should be adjusted accordingly: 926 OutlinedFunctionCost -= 927 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 928 929 int OutliningRuntimeOverhead = 930 OutliningFuncCallCost + 931 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 932 ExtraOutliningPenalty; 933 934 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 935 } 936 937 // Create the callsite to profile count map which is 938 // used to update the original function's entry count, 939 // after the function is partially inlined into the callsite. 
940 void PartialInlinerImpl::computeCallsiteToProfCountMap( 941 Function *DuplicateFunction, 942 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const { 943 std::vector<User *> Users(DuplicateFunction->user_begin(), 944 DuplicateFunction->user_end()); 945 Function *CurrentCaller = nullptr; 946 std::unique_ptr<BlockFrequencyInfo> TempBFI; 947 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 948 949 auto ComputeCurrBFI = [&,this](Function *Caller) { 950 // For the old pass manager: 951 if (!GetBFI) { 952 DominatorTree DT(*Caller); 953 LoopInfo LI(DT); 954 BranchProbabilityInfo BPI(*Caller, LI); 955 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 956 CurrentCallerBFI = TempBFI.get(); 957 } else { 958 // New pass manager: 959 CurrentCallerBFI = &(GetBFI(*Caller)); 960 } 961 }; 962 963 for (User *User : Users) { 964 CallBase *CB = getSupportedCallBase(User); 965 Function *Caller = CB->getCaller(); 966 if (CurrentCaller != Caller) { 967 CurrentCaller = Caller; 968 ComputeCurrBFI(Caller); 969 } else { 970 assert(CurrentCallerBFI && "CallerBFI is not set"); 971 } 972 BasicBlock *CallBB = CB->getParent(); 973 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 974 if (Count) 975 CallSiteToProfCountMap[User] = *Count; 976 else 977 CallSiteToProfCountMap[User] = 0; 978 } 979 } 980 981 PartialInlinerImpl::FunctionCloner::FunctionCloner( 982 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, 983 function_ref<AssumptionCache *(Function &)> LookupAC, 984 function_ref<TargetTransformInfo &(Function &)> GetTTI) 985 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 986 ClonedOI = std::make_unique<FunctionOutliningInfo>(); 987 988 // Clone the function, so that we can hack away on it. 
989 ValueToValueMapTy VMap; 990 ClonedFunc = CloneFunction(F, VMap); 991 992 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 993 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 994 for (BasicBlock *BB : OI->Entries) 995 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 996 997 for (BasicBlock *E : OI->ReturnBlockPreds) { 998 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 999 ClonedOI->ReturnBlockPreds.push_back(NewE); 1000 } 1001 // Go ahead and update all uses to the duplicate, so that we can just 1002 // use the inliner functionality when we're done hacking. 1003 F->replaceAllUsesWith(ClonedFunc); 1004 } 1005 1006 PartialInlinerImpl::FunctionCloner::FunctionCloner( 1007 Function *F, FunctionOutliningMultiRegionInfo *OI, 1008 OptimizationRemarkEmitter &ORE, 1009 function_ref<AssumptionCache *(Function &)> LookupAC, 1010 function_ref<TargetTransformInfo &(Function &)> GetTTI) 1011 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 1012 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); 1013 1014 // Clone the function, so that we can hack away on it. 1015 ValueToValueMapTy VMap; 1016 ClonedFunc = CloneFunction(F, VMap); 1017 1018 // Go through all Outline Candidate Regions and update all BasicBlock 1019 // information. 
1020 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1021 OI->ORI) { 1022 SmallVector<BasicBlock *, 8> Region; 1023 for (BasicBlock *BB : RegionInfo.Region) 1024 Region.push_back(cast<BasicBlock>(VMap[BB])); 1025 1026 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1027 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1028 BasicBlock *NewReturnBlock = nullptr; 1029 if (RegionInfo.ReturnBlock) 1030 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1031 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1032 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1033 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1034 } 1035 // Go ahead and update all uses to the duplicate, so that we can just 1036 // use the inliner functionality when we're done hacking. 1037 F->replaceAllUsesWith(ClonedFunc); 1038 } 1039 1040 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { 1041 auto GetFirstPHI = [](BasicBlock *BB) { 1042 BasicBlock::iterator I = BB->begin(); 1043 PHINode *FirstPhi = nullptr; 1044 while (I != BB->end()) { 1045 PHINode *Phi = dyn_cast<PHINode>(I); 1046 if (!Phi) 1047 break; 1048 if (!FirstPhi) { 1049 FirstPhi = Phi; 1050 break; 1051 } 1052 } 1053 return FirstPhi; 1054 }; 1055 1056 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1057 // blocks. 1058 if (!ClonedOI) 1059 return; 1060 1061 // Special hackery is needed with PHI nodes that have inputs from more than 1062 // one extracted block. For simplicity, just split the PHIs into a two-level 1063 // sequence of PHIs, some of which will go in the extracted region, and some 1064 // of which will go outside. 
1065 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1066 // only split block when necessary: 1067 PHINode *FirstPhi = GetFirstPHI(PreReturn); 1068 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1069 1070 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1071 return; 1072 1073 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1074 Value *CommonValue = PN->getIncomingValue(0); 1075 if (all_of(PN->incoming_values(), 1076 [&](Value *V) { return V == CommonValue; })) 1077 return CommonValue; 1078 return nullptr; 1079 }; 1080 1081 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1082 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1083 BasicBlock::iterator I = PreReturn->begin(); 1084 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1085 SmallVector<Instruction *, 4> DeadPhis; 1086 while (I != PreReturn->end()) { 1087 PHINode *OldPhi = dyn_cast<PHINode>(I); 1088 if (!OldPhi) 1089 break; 1090 1091 PHINode *RetPhi = 1092 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1093 OldPhi->replaceAllUsesWith(RetPhi); 1094 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1095 1096 RetPhi->addIncoming(&*I, PreReturn); 1097 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1098 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1099 OldPhi->removeIncomingValue(E); 1100 } 1101 1102 // After incoming values splitting, the old phi may become trivial. 1103 // Keeping the trivial phi can introduce definition inside the outline 1104 // region which is live-out, causing necessary overhead (load, store 1105 // arg passing etc). 
1106 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1107 OldPhi->replaceAllUsesWith(OldPhiVal); 1108 DeadPhis.push_back(OldPhi); 1109 } 1110 ++I; 1111 } 1112 for (auto *DP : DeadPhis) 1113 DP->eraseFromParent(); 1114 1115 for (auto *E : ClonedOI->ReturnBlockPreds) 1116 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1117 } 1118 1119 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1120 1121 auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) { 1122 int Cost = 0; 1123 for (BasicBlock* BB : Region) 1124 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 1125 return Cost; 1126 }; 1127 1128 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1129 1130 if (ClonedOMRI->ORI.empty()) 1131 return false; 1132 1133 // The CodeExtractor needs a dominator tree. 1134 DominatorTree DT; 1135 DT.recalculate(*ClonedFunc); 1136 1137 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1138 LoopInfo LI(DT); 1139 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1140 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1141 1142 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 
1143 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1144 1145 SetVector<Value *> Inputs, Outputs, Sinks; 1146 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1147 ClonedOMRI->ORI) { 1148 int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region); 1149 1150 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1151 ClonedFuncBFI.get(), &BPI, 1152 LookupAC(*RegionInfo.EntryBlock->getParent()), 1153 /* AllowVarargs */ false); 1154 1155 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1156 1157 LLVM_DEBUG({ 1158 dbgs() << "inputs: " << Inputs.size() << "\n"; 1159 dbgs() << "outputs: " << Outputs.size() << "\n"; 1160 for (Value *value : Inputs) 1161 dbgs() << "value used in func: " << *value << "\n"; 1162 for (Value *output : Outputs) 1163 dbgs() << "instr used in func: " << *output << "\n"; 1164 }); 1165 1166 // Do not extract regions that have live exit variables. 1167 if (Outputs.size() > 0 && !ForceLiveExit) 1168 continue; 1169 1170 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) { 1171 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc); 1172 BasicBlock *OutliningCallBB = OCS->getParent(); 1173 assert(OutliningCallBB->getParent() == ClonedFunc); 1174 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1175 NumColdRegionsOutlined++; 1176 OutlinedRegionCost += CurrentOutlinedRegionCost; 1177 1178 if (MarkOutlinedColdCC) { 1179 OutlinedFunc->setCallingConv(CallingConv::Cold); 1180 OCS->setCallingConv(CallingConv::Cold); 1181 } 1182 } else 1183 ORE.emit([&]() { 1184 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1185 &RegionInfo.Region.front()->front()) 1186 << "Failed to extract region at block " 1187 << ore::NV("Block", RegionInfo.Region.front()); 1188 }); 1189 } 1190 1191 return !OutlinedFunctions.empty(); 1192 } 1193 1194 Function * 1195 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1196 // Returns true if the block is to be partial 
inlined into the caller 1197 // (i.e. not to be extracted to the out of line function) 1198 auto ToBeInlined = [&, this](BasicBlock *BB) { 1199 return BB == ClonedOI->ReturnBlock || 1200 llvm::is_contained(ClonedOI->Entries, BB); 1201 }; 1202 1203 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1204 // The CodeExtractor needs a dominator tree. 1205 DominatorTree DT; 1206 DT.recalculate(*ClonedFunc); 1207 1208 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1209 LoopInfo LI(DT); 1210 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1211 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1212 1213 // Gather up the blocks that we're going to extract. 1214 std::vector<BasicBlock *> ToExtract; 1215 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); 1216 ToExtract.push_back(ClonedOI->NonReturnBlock); 1217 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( 1218 ClonedOI->NonReturnBlock, ClonedFuncTTI); 1219 for (BasicBlock &BB : *ClonedFunc) 1220 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1221 ToExtract.push_back(&BB); 1222 // FIXME: the code extractor may hoist/sink more code 1223 // into the outlined function which may make the outlining 1224 // overhead (the difference of the outlined function cost 1225 // and OutliningRegionCost) look larger. 1226 OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); 1227 } 1228 1229 // Extract the body of the if. 
1230 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1231 Function *OutlinedFunc = 1232 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1233 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), 1234 /* AllowVarargs */ true) 1235 .extractCodeRegion(CEAC); 1236 1237 if (OutlinedFunc) { 1238 BasicBlock *OutliningCallBB = 1239 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent(); 1240 assert(OutliningCallBB->getParent() == ClonedFunc); 1241 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1242 } else 1243 ORE.emit([&]() { 1244 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1245 &ToExtract.front()->front()) 1246 << "Failed to extract region at block " 1247 << ore::NV("Block", ToExtract.front()); 1248 }); 1249 1250 return OutlinedFunc; 1251 } 1252 1253 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1254 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1255 // users (function pointers, etc.) back to the original function. 1256 ClonedFunc->replaceAllUsesWith(OrigFunc); 1257 ClonedFunc->eraseFromParent(); 1258 if (!IsFunctionInlined) { 1259 // Remove each function that was speculatively created if there is no 1260 // reference. 
1261 for (auto FuncBBPair : OutlinedFunctions) { 1262 Function *Func = FuncBBPair.first; 1263 Func->eraseFromParent(); 1264 } 1265 } 1266 } 1267 1268 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) { 1269 if (F.hasAddressTaken()) 1270 return {false, nullptr}; 1271 1272 // Let inliner handle it 1273 if (F.hasFnAttribute(Attribute::AlwaysInline)) 1274 return {false, nullptr}; 1275 1276 if (F.hasFnAttribute(Attribute::NoInline)) 1277 return {false, nullptr}; 1278 1279 if (PSI.isFunctionEntryCold(&F)) 1280 return {false, nullptr}; 1281 1282 if (F.users().empty()) 1283 return {false, nullptr}; 1284 1285 OptimizationRemarkEmitter ORE(&F); 1286 1287 // Only try to outline cold regions if we have a profile summary, which 1288 // implies we have profiling information. 1289 if (PSI.hasProfileSummary() && F.hasProfileData() && 1290 !DisableMultiRegionPartialInline) { 1291 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1292 computeOutliningColdRegionsInfo(F, ORE); 1293 if (OMRI) { 1294 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); 1295 1296 LLVM_DEBUG({ 1297 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n"; 1298 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold() 1299 << "\n"; 1300 }); 1301 1302 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1303 1304 if (DidOutline) { 1305 LLVM_DEBUG({ 1306 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1307 Cloner.ClonedFunc->print(dbgs()); 1308 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1309 }); 1310 1311 if (tryPartialInline(Cloner)) 1312 return {true, nullptr}; 1313 } 1314 } 1315 } 1316 1317 // Fall-thru to regular partial inlining if we: 1318 // i) can't find any cold regions to outline, or 1319 // ii) can't inline the outlined function anywhere. 
1320 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1321 if (!OI) 1322 return {false, nullptr}; 1323 1324 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI); 1325 Cloner.normalizeReturnBlock(); 1326 1327 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1328 1329 if (!OutlinedFunction) 1330 return {false, nullptr}; 1331 1332 if (tryPartialInline(Cloner)) 1333 return {true, OutlinedFunction}; 1334 1335 return {false, nullptr}; 1336 } 1337 1338 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1339 if (Cloner.OutlinedFunctions.empty()) 1340 return false; 1341 1342 int SizeCost = 0; 1343 BlockFrequency WeightedRcost; 1344 int NonWeightedRcost; 1345 std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner); 1346 1347 // Only calculate RelativeToEntryFreq when we are doing single region 1348 // outlining. 1349 BranchProbability RelativeToEntryFreq; 1350 if (Cloner.ClonedOI) 1351 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1352 else 1353 // RelativeToEntryFreq doesn't make sense when we have more than one 1354 // outlined call because each call will have a different relative frequency 1355 // to the entry block. We can consider using the average, but the 1356 // usefulness of that information is questionable. For now, assume we never 1357 // execute the calls to outlined functions. 1358 RelativeToEntryFreq = BranchProbability(0, 1); 1359 1360 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1361 1362 // The call sequence(s) to the outlined function(s) are larger than the sum of 1363 // the original outlined region size(s), it does not increase the chances of 1364 // inlining the function with outlining (The inliner uses the size increase to 1365 // model the cost of inlining a callee). 
1366 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1367 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1368 DebugLoc DLoc; 1369 BasicBlock *Block; 1370 std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc); 1371 OrigFuncORE.emit([&]() { 1372 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1373 DLoc, Block) 1374 << ore::NV("Function", Cloner.OrigFunc) 1375 << " not partially inlined into callers (Original Size = " 1376 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1377 << ", Size of call sequence to outlined function = " 1378 << ore::NV("NewSize", SizeCost) << ")"; 1379 }); 1380 return false; 1381 } 1382 1383 assert(Cloner.OrigFunc->users().empty() && 1384 "F's users should all be replaced!"); 1385 1386 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1387 Cloner.ClonedFunc->user_end()); 1388 1389 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1390 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1391 if (CalleeEntryCount) 1392 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1393 1394 uint64_t CalleeEntryCountV = 1395 (CalleeEntryCount ? CalleeEntryCount.getCount() : 0); 1396 1397 bool AnyInline = false; 1398 for (User *User : Users) { 1399 CallBase *CB = getSupportedCallBase(User); 1400 1401 if (isLimitReached()) 1402 continue; 1403 1404 OptimizationRemarkEmitter CallerORE(CB->getCaller()); 1405 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE)) 1406 continue; 1407 1408 // Construct remark before doing the inlining, as after successful inlining 1409 // the callsite is removed. 
1410 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB); 1411 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1412 << ore::NV("Caller", CB->getCaller()); 1413 1414 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI); 1415 // We can only forward varargs when we outlined a single region, else we 1416 // bail on vararg functions. 1417 if (!InlineFunction(*CB, IFI, nullptr, true, 1418 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1419 : nullptr)) 1420 .isSuccess()) 1421 continue; 1422 1423 CallerORE.emit(OR); 1424 1425 // Now update the entry count: 1426 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1427 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1428 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1429 } 1430 1431 AnyInline = true; 1432 NumPartialInlining++; 1433 // Update the stats 1434 if (Cloner.ClonedOI) 1435 NumPartialInlined++; 1436 else 1437 NumColdOutlinePartialInlined++; 1438 } 1439 1440 if (AnyInline) { 1441 Cloner.IsFunctionInlined = true; 1442 if (CalleeEntryCount) 1443 Cloner.OrigFunc->setEntryCount( 1444 CalleeEntryCount.setCount(CalleeEntryCountV)); 1445 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1446 OrigFuncORE.emit([&]() { 1447 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1448 << "Partially inlined into at least one caller"; 1449 }); 1450 } 1451 1452 return AnyInline; 1453 } 1454 1455 bool PartialInlinerImpl::run(Module &M) { 1456 if (DisablePartialInlining) 1457 return false; 1458 1459 std::vector<Function *> Worklist; 1460 Worklist.reserve(M.size()); 1461 for (Function &F : M) 1462 if (!F.use_empty() && !F.isDeclaration()) 1463 Worklist.push_back(&F); 1464 1465 bool Changed = false; 1466 while (!Worklist.empty()) { 1467 Function *CurrFunc = Worklist.back(); 1468 Worklist.pop_back(); 1469 1470 if (CurrFunc->use_empty()) 1471 continue; 1472 1473 bool Recursive = false; 1474 for (User *U : CurrFunc->users()) 1475 if 
(Instruction *I = dyn_cast<Instruction>(U)) 1476 if (I->getParent()->getParent() == CurrFunc) { 1477 Recursive = true; 1478 break; 1479 } 1480 if (Recursive) 1481 continue; 1482 1483 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc); 1484 if (Result.second) 1485 Worklist.push_back(Result.second); 1486 Changed |= Result.first; 1487 } 1488 1489 return Changed; 1490 } 1491 1492 char PartialInlinerLegacyPass::ID = 0; 1493 1494 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 1495 "Partial Inliner", false, false) 1496 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1497 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1498 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1499 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 1500 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 1501 "Partial Inliner", false, false) 1502 1503 ModulePass *llvm::createPartialInliningPass() { 1504 return new PartialInlinerLegacyPass(); 1505 } 1506 1507 PreservedAnalyses PartialInlinerPass::run(Module &M, 1508 ModuleAnalysisManager &AM) { 1509 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1510 1511 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { 1512 return FAM.getResult<AssumptionAnalysis>(F); 1513 }; 1514 1515 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { 1516 return FAM.getCachedResult<AssumptionAnalysis>(F); 1517 }; 1518 1519 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { 1520 return FAM.getResult<BlockFrequencyAnalysis>(F); 1521 }; 1522 1523 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { 1524 return FAM.getResult<TargetIRAnalysis>(F); 1525 }; 1526 1527 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { 1528 return FAM.getResult<TargetLibraryAnalysis>(F); 1529 }; 1530 1531 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M); 1532 1533 if (PartialInlinerImpl(GetAssumptionCache, 
LookupAssumptionCache, GetTTI, 1534 GetTLI, PSI, GetBFI) 1535 .run(M)) 1536 return PreservedAnalyses::none(); 1537 return PreservedAnalyses::all(); 1538 } 1539