1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/BranchProbabilityInfo.h" 24 #include "llvm/Analysis/InlineCost.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 27 #include "llvm/Analysis/ProfileSummaryInfo.h" 28 #include "llvm/Analysis/TargetLibraryInfo.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/IR/Attributes.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/CFG.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Dominators.h" 36 #include "llvm/IR/Function.h" 37 #include "llvm/IR/InstrTypes.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/IntrinsicInst.h" 41 #include "llvm/IR/Intrinsics.h" 42 #include "llvm/IR/Module.h" 43 #include "llvm/IR/Operator.h" 44 #include "llvm/IR/User.h" 45 #include "llvm/InitializePasses.h" 46 #include "llvm/Pass.h" 47 #include "llvm/Support/BlockFrequency.h" 48 #include "llvm/Support/BranchProbability.h" 49 #include "llvm/Support/Casting.h" 50 #include "llvm/Support/CommandLine.h" 51 #include "llvm/Support/ErrorHandling.h" 52 #include "llvm/Transforms/IPO.h" 53 #include "llvm/Transforms/Utils/Cloning.h" 54 #include "llvm/Transforms/Utils/CodeExtractor.h" 55 #include "llvm/Transforms/Utils/ValueMapper.h" 56 #include <algorithm> 57 #include <cassert> 58 #include <cstdint> 59 #include <functional> 60 #include <iterator> 61 #include <memory> 62 #include <tuple> 63 #include <vector> 64 65 using namespace llvm; 66 67 #define DEBUG_TYPE "partial-inlining" 68 69 STATISTIC(NumPartialInlined, 70 "Number of callsites functions partially inlined into."); 71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with " 72 "cold outlined regions were partially " 73 "inlined into its caller(s)."); 74 STATISTIC(NumColdRegionsFound, 75 "Number of cold single entry/exit regions found."); 76 STATISTIC(NumColdRegionsOutlined, 77 "Number of cold single entry/exit regions outlined."); 78 79 // Command line option to disable partial-inlining. The default is false: 80 static cl::opt<bool> 81 DisablePartialInlining("disable-partial-inlining", cl::init(false), 82 cl::Hidden, cl::desc("Disable partial inlining")); 83 // Command line option to disable multi-region partial-inlining. The default is 84 // false: 85 static cl::opt<bool> DisableMultiRegionPartialInline( 86 "disable-mr-partial-inlining", cl::init(false), cl::Hidden, 87 cl::desc("Disable multi-region partial inlining")); 88 89 // Command line option to force outlining in regions with live exit variables. 90 // The default is false: 91 static cl::opt<bool> 92 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden, 93 cl::desc("Force outline regions with live exits")); 94 95 // Command line option to enable marking outline functions with Cold Calling 96 // Convention. The default is false: 97 static cl::opt<bool> 98 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden, 99 cl::desc("Mark outline function calls with ColdCC")); 100 101 // This is an option used by testing: 102 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis", 103 cl::init(false), cl::ZeroOrMore, 104 cl::ReallyHidden, 105 cl::desc("Skip Cost Analysis")); 106 // Used to determine if a cold region is worth outlining based on 107 // its inlining cost compared to the original function. Default is set at 10%. 108 // ie. if the cold region reduces the inlining cost of the original function by 109 // at least 10%. 110 static cl::opt<float> MinRegionSizeRatio( 111 "min-region-size-ratio", cl::init(0.1), cl::Hidden, 112 cl::desc("Minimum ratio comparing relative sizes of each " 113 "outline candidate and original function")); 114 // Used to tune the minimum number of execution counts needed in the predecessor 115 // block to the cold edge. ie. confidence interval. 116 static cl::opt<unsigned> 117 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden, 118 cl::desc("Minimum block executions to consider " 119 "its BranchProbabilityInfo valid")); 120 // Used to determine when an edge is considered cold. Default is set to 10%. ie. 121 // if the branch probability is 10% or less, then it is deemed as 'cold'. 122 static cl::opt<float> ColdBranchRatio( 123 "cold-branch-ratio", cl::init(0.1), cl::Hidden, 124 cl::desc("Minimum BranchProbability to consider a region cold.")); 125 126 static cl::opt<unsigned> MaxNumInlineBlocks( 127 "max-num-inline-blocks", cl::init(5), cl::Hidden, 128 cl::desc("Max number of blocks to be partially inlined")); 129 130 // Command line option to set the maximum number of partial inlining allowed 131 // for the module. The default value of -1 means no limit. 132 static cl::opt<int> MaxNumPartialInlining( 133 "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore, 134 cl::desc("Max number of partial inlining. The default is unlimited")); 135 136 // Used only when PGO or user annotated branch data is absent. It is 137 // the least value that is used to weigh the outline region. If BFI 138 // produces larger value, the BFI value will be used. 139 static cl::opt<int> 140 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75), 141 cl::Hidden, cl::ZeroOrMore, 142 cl::desc("Relative frequency of outline region to " 143 "the entry block")); 144 145 static cl::opt<unsigned> ExtraOutliningPenalty( 146 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden, 147 cl::desc("A debug option to add additional penalty to the computed one.")); 148 149 namespace { 150 151 struct FunctionOutliningInfo { 152 FunctionOutliningInfo() = default; 153 154 // Returns the number of blocks to be inlined including all blocks 155 // in Entries and one return block. 156 unsigned getNumInlinedBlocks() const { return Entries.size() + 1; } 157 158 // A set of blocks including the function entry that guard 159 // the region to be outlined. 160 SmallVector<BasicBlock *, 4> Entries; 161 162 // The return block that is not included in the outlined region. 163 BasicBlock *ReturnBlock = nullptr; 164 165 // The dominating block of the region to be outlined. 166 BasicBlock *NonReturnBlock = nullptr; 167 168 // The set of blocks in Entries that that are predecessors to ReturnBlock 169 SmallVector<BasicBlock *, 4> ReturnBlockPreds; 170 }; 171 172 struct FunctionOutliningMultiRegionInfo { 173 FunctionOutliningMultiRegionInfo() = default; 174 175 // Container for outline regions 176 struct OutlineRegionInfo { 177 OutlineRegionInfo(ArrayRef<BasicBlock *> Region, 178 BasicBlock *EntryBlock, BasicBlock *ExitBlock, 179 BasicBlock *ReturnBlock) 180 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock), 181 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {} 182 SmallVector<BasicBlock *, 8> Region; 183 BasicBlock *EntryBlock; 184 BasicBlock *ExitBlock; 185 BasicBlock *ReturnBlock; 186 }; 187 188 SmallVector<OutlineRegionInfo, 4> ORI; 189 }; 190 191 struct PartialInlinerImpl { 192 193 PartialInlinerImpl( 194 function_ref<AssumptionCache &(Function &)> GetAC, 195 function_ref<AssumptionCache *(Function &)> LookupAC, 196 function_ref<TargetTransformInfo &(Function &)> GTTI, 197 function_ref<const TargetLibraryInfo &(Function &)> GTLI, 198 ProfileSummaryInfo &ProfSI, 199 function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr) 200 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC), 201 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {} 202 203 bool run(Module &M); 204 // Main part of the transformation that calls helper functions to find 205 // outlining candidates, clone & outline the function, and attempt to 206 // partially inline the resulting function. Returns true if 207 // inlining was successful, false otherwise. Also returns the outline 208 // function (only if we partially inlined early returns) as there is a 209 // possibility to further "peel" early return statements that were left in the 210 // outline function due to code size. 211 std::pair<bool, Function *> unswitchFunction(Function &F); 212 213 // This class speculatively clones the function to be partial inlined. 214 // At the end of partial inlining, the remaining callsites to the cloned 215 // function that are not partially inlined will be fixed up to reference 216 // the original function, and the cloned function will be erased. 217 struct FunctionCloner { 218 // Two constructors, one for single region outlining, the other for 219 // multi-region outlining. 220 FunctionCloner(Function *F, FunctionOutliningInfo *OI, 221 OptimizationRemarkEmitter &ORE, 222 function_ref<AssumptionCache *(Function &)> LookupAC, 223 function_ref<TargetTransformInfo &(Function &)> GetTTI); 224 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI, 225 OptimizationRemarkEmitter &ORE, 226 function_ref<AssumptionCache *(Function &)> LookupAC, 227 function_ref<TargetTransformInfo &(Function &)> GetTTI); 228 229 ~FunctionCloner(); 230 231 // Prepare for function outlining: making sure there is only 232 // one incoming edge from the extracted/outlined region to 233 // the return block. 234 void normalizeReturnBlock() const; 235 236 // Do function outlining for cold regions. 237 bool doMultiRegionFunctionOutlining(); 238 // Do function outlining for region after early return block(s). 239 // NOTE: For vararg functions that do the vararg handling in the outlined 240 // function, we temporarily generate IR that does not properly 241 // forward varargs to the outlined function. Calling InlineFunction 242 // will update calls to the outlined functions to properly forward 243 // the varargs. 244 Function *doSingleRegionFunctionOutlining(); 245 246 Function *OrigFunc = nullptr; 247 Function *ClonedFunc = nullptr; 248 249 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair; 250 // Keep track of Outlined Functions and the basic block they're called from. 251 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions; 252 253 // ClonedFunc is inlined in one of its callers after function 254 // outlining. 255 bool IsFunctionInlined = false; 256 // The cost of the region to be outlined. 257 InstructionCost OutlinedRegionCost = 0; 258 // ClonedOI is specific to outlining non-early return blocks. 259 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr; 260 // ClonedOMRI is specific to outlining cold regions. 261 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr; 262 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr; 263 OptimizationRemarkEmitter &ORE; 264 function_ref<AssumptionCache *(Function &)> LookupAC; 265 function_ref<TargetTransformInfo &(Function &)> GetTTI; 266 }; 267 268 private: 269 int NumPartialInlining = 0; 270 function_ref<AssumptionCache &(Function &)> GetAssumptionCache; 271 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache; 272 function_ref<TargetTransformInfo &(Function &)> GetTTI; 273 function_ref<BlockFrequencyInfo &(Function &)> GetBFI; 274 function_ref<const TargetLibraryInfo &(Function &)> GetTLI; 275 ProfileSummaryInfo &PSI; 276 277 // Return the frequency of the OutlininingBB relative to F's entry point. 278 // The result is no larger than 1 and is represented using BP. 279 // (Note that the outlined region's 'head' block can only have incoming 280 // edges from the guarding entry blocks). 281 BranchProbability 282 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const; 283 284 // Return true if the callee of CB should be partially inlined with 285 // profit. 286 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner, 287 BlockFrequency WeightedOutliningRcost, 288 OptimizationRemarkEmitter &ORE) const; 289 290 // Try to inline DuplicateFunction (cloned from F with call to 291 // the OutlinedFunction into its callers. Return true 292 // if there is any successful inlining. 293 bool tryPartialInline(FunctionCloner &Cloner); 294 295 // Compute the mapping from use site of DuplicationFunction to the enclosing 296 // BB's profile count. 297 void 298 computeCallsiteToProfCountMap(Function *DuplicateFunction, 299 DenseMap<User *, uint64_t> &SiteCountMap) const; 300 301 bool isLimitReached() const { 302 return (MaxNumPartialInlining != -1 && 303 NumPartialInlining >= MaxNumPartialInlining); 304 } 305 306 static CallBase *getSupportedCallBase(User *U) { 307 if (isa<CallInst>(U) || isa<InvokeInst>(U)) 308 return cast<CallBase>(U); 309 llvm_unreachable("All uses must be calls"); 310 return nullptr; 311 } 312 313 static CallBase *getOneCallSiteTo(Function &F) { 314 User *User = *F.user_begin(); 315 return getSupportedCallBase(User); 316 } 317 318 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const { 319 CallBase *CB = getOneCallSiteTo(F); 320 DebugLoc DLoc = CB->getDebugLoc(); 321 BasicBlock *Block = CB->getParent(); 322 return std::make_tuple(DLoc, Block); 323 } 324 325 // Returns the costs associated with function outlining: 326 // - The first value is the non-weighted runtime cost for making the call 327 // to the outlined function, including the addtional setup cost in the 328 // outlined function itself; 329 // - The second value is the estimated size of the new call sequence in 330 // basic block Cloner.OutliningCallBB; 331 std::tuple<InstructionCost, InstructionCost> 332 computeOutliningCosts(FunctionCloner &Cloner) const; 333 334 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to 335 // approximate both the size and runtime cost (Note that in the current 336 // inline cost analysis, there is no clear distinction there either). 337 static InstructionCost computeBBInlineCost(BasicBlock *BB, 338 TargetTransformInfo *TTI); 339 340 std::unique_ptr<FunctionOutliningInfo> 341 computeOutliningInfo(Function &F) const; 342 343 std::unique_ptr<FunctionOutliningMultiRegionInfo> 344 computeOutliningColdRegionsInfo(Function &F, 345 OptimizationRemarkEmitter &ORE) const; 346 }; 347 348 struct PartialInlinerLegacyPass : public ModulePass { 349 static char ID; // Pass identification, replacement for typeid 350 351 PartialInlinerLegacyPass() : ModulePass(ID) { 352 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); 353 } 354 355 void getAnalysisUsage(AnalysisUsage &AU) const override { 356 AU.addRequired<AssumptionCacheTracker>(); 357 AU.addRequired<ProfileSummaryInfoWrapperPass>(); 358 AU.addRequired<TargetTransformInfoWrapperPass>(); 359 AU.addRequired<TargetLibraryInfoWrapperPass>(); 360 } 361 362 bool runOnModule(Module &M) override { 363 if (skipModule(M)) 364 return false; 365 366 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>(); 367 TargetTransformInfoWrapperPass *TTIWP = 368 &getAnalysis<TargetTransformInfoWrapperPass>(); 369 ProfileSummaryInfo &PSI = 370 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); 371 372 auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & { 373 return ACT->getAssumptionCache(F); 374 }; 375 376 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * { 377 return ACT->lookupAssumptionCache(F); 378 }; 379 380 auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & { 381 return TTIWP->getTTI(F); 382 }; 383 384 auto GetTLI = [this](Function &F) -> TargetLibraryInfo & { 385 return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F); 386 }; 387 388 return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 389 GetTLI, PSI) 390 .run(M); 391 } 392 }; 393 394 } // end anonymous namespace 395 396 std::unique_ptr<FunctionOutliningMultiRegionInfo> 397 PartialInlinerImpl::computeOutliningColdRegionsInfo( 398 Function &F, OptimizationRemarkEmitter &ORE) const { 399 BasicBlock *EntryBlock = &F.front(); 400 401 DominatorTree DT(F); 402 LoopInfo LI(DT); 403 BranchProbabilityInfo BPI(F, LI); 404 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 405 BlockFrequencyInfo *BFI; 406 if (!GetBFI) { 407 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI)); 408 BFI = ScopedBFI.get(); 409 } else 410 BFI = &(GetBFI(F)); 411 412 // Return if we don't have profiling information. 413 if (!PSI.hasInstrumentationProfile()) 414 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 415 416 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 417 std::make_unique<FunctionOutliningMultiRegionInfo>(); 418 419 auto IsSingleExit = 420 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 421 BasicBlock *ExitBlock = nullptr; 422 for (auto *Block : BlockList) { 423 for (BasicBlock *Succ : successors(Block)) { 424 if (!is_contained(BlockList, Succ)) { 425 if (ExitBlock) { 426 ORE.emit([&]() { 427 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 428 &Succ->front()) 429 << "Region dominated by " 430 << ore::NV("Block", BlockList.front()->getName()) 431 << " has more than one region exit edge."; 432 }); 433 return nullptr; 434 } 435 436 ExitBlock = Block; 437 } 438 } 439 } 440 return ExitBlock; 441 }; 442 443 auto BBProfileCount = [BFI](BasicBlock *BB) { 444 return BFI->getBlockProfileCount(BB).getValueOr(0); 445 }; 446 447 // Use the same computeBBInlineCost function to compute the cost savings of 448 // the outlining the candidate region. 449 TargetTransformInfo *FTTI = &GetTTI(F); 450 InstructionCost OverallFunctionCost = 0; 451 for (auto &BB : F) 452 OverallFunctionCost += computeBBInlineCost(&BB, FTTI); 453 454 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost 455 << "\n";); 456 457 InstructionCost MinOutlineRegionCost = OverallFunctionCost.map( 458 [&](auto Cost) { return Cost * MinRegionSizeRatio; }); 459 460 BranchProbability MinBranchProbability( 461 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 462 MinBlockCounterExecution); 463 bool ColdCandidateFound = false; 464 BasicBlock *CurrEntry = EntryBlock; 465 std::vector<BasicBlock *> DFS; 466 DenseMap<BasicBlock *, bool> VisitedMap; 467 DFS.push_back(CurrEntry); 468 VisitedMap[CurrEntry] = true; 469 470 // Use Depth First Search on the basic blocks to find CFG edges that are 471 // considered cold. 472 // Cold regions considered must also have its inline cost compared to the 473 // overall inline cost of the original function. The region is outlined only 474 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 475 // more. 476 while (!DFS.empty()) { 477 auto *ThisBB = DFS.back(); 478 DFS.pop_back(); 479 // Only consider regions with predecessor blocks that are considered 480 // not-cold (default: part of the top 99.99% of all block counters) 481 // AND greater than our minimum block execution count (default: 100). 482 if (PSI.isColdBlock(ThisBB, BFI) || 483 BBProfileCount(ThisBB) < MinBlockCounterExecution) 484 continue; 485 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) { 486 if (VisitedMap[*SI]) 487 continue; 488 VisitedMap[*SI] = true; 489 DFS.push_back(*SI); 490 // If branch isn't cold, we skip to the next one. 491 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI); 492 if (SuccProb > MinBranchProbability) 493 continue; 494 495 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->" 496 << SI->getName() 497 << "\nBranch Probability = " << SuccProb << "\n";); 498 499 SmallVector<BasicBlock *, 8> DominateVector; 500 DT.getDescendants(*SI, DominateVector); 501 assert(!DominateVector.empty() && 502 "SI should be reachable and have at least itself as descendant"); 503 504 // We can only outline single entry regions (for now). 505 if (!DominateVector.front()->hasNPredecessors(1)) { 506 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 507 << " doesn't have a single predecessor in the " 508 "dominator tree\n";); 509 continue; 510 } 511 512 BasicBlock *ExitBlock = nullptr; 513 // We can only outline single exit regions (for now). 514 if (!(ExitBlock = IsSingleExit(DominateVector))) { 515 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName() 516 << " doesn't have a unique successor\n";); 517 continue; 518 } 519 520 InstructionCost OutlineRegionCost = 0; 521 for (auto *BB : DominateVector) 522 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 523 524 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost 525 << "\n";); 526 527 if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) { 528 ORE.emit([&]() { 529 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 530 &SI->front()) 531 << ore::NV("Callee", &F) 532 << " inline cost-savings smaller than " 533 << ore::NV("Cost", MinOutlineRegionCost); 534 }); 535 536 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than " 537 << MinOutlineRegionCost << "\n";); 538 continue; 539 } 540 541 // For now, ignore blocks that belong to a SISE region that is a 542 // candidate for outlining. In the future, we may want to look 543 // at inner regions because the outer region may have live-exit 544 // variables. 545 for (auto *BB : DominateVector) 546 VisitedMap[BB] = true; 547 548 // ReturnBlock here means the block after the outline call 549 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor(); 550 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo( 551 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock); 552 OutliningInfo->ORI.push_back(RegInfo); 553 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: " 554 << DominateVector.front()->getName() << "\n";); 555 ColdCandidateFound = true; 556 NumColdRegionsFound++; 557 } 558 } 559 560 if (ColdCandidateFound) 561 return OutliningInfo; 562 563 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 564 } 565 566 std::unique_ptr<FunctionOutliningInfo> 567 PartialInlinerImpl::computeOutliningInfo(Function &F) const { 568 BasicBlock *EntryBlock = &F.front(); 569 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator()); 570 if (!BR || BR->isUnconditional()) 571 return std::unique_ptr<FunctionOutliningInfo>(); 572 573 // Returns true if Succ is BB's successor 574 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) { 575 return is_contained(successors(BB), Succ); 576 }; 577 578 auto IsReturnBlock = [](BasicBlock *BB) { 579 Instruction *TI = BB->getTerminator(); 580 return isa<ReturnInst>(TI); 581 }; 582 583 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 584 if (IsReturnBlock(Succ1)) 585 return std::make_tuple(Succ1, Succ2); 586 if (IsReturnBlock(Succ2)) 587 return std::make_tuple(Succ2, Succ1); 588 589 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 590 }; 591 592 // Detect a triangular shape: 593 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) { 594 if (IsSuccessor(Succ1, Succ2)) 595 return std::make_tuple(Succ1, Succ2); 596 if (IsSuccessor(Succ2, Succ1)) 597 return std::make_tuple(Succ2, Succ1); 598 599 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr); 600 }; 601 602 std::unique_ptr<FunctionOutliningInfo> OutliningInfo = 603 std::make_unique<FunctionOutliningInfo>(); 604 605 BasicBlock *CurrEntry = EntryBlock; 606 bool CandidateFound = false; 607 do { 608 // The number of blocks to be inlined has already reached 609 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this 610 // disables partial inlining for the function. 611 if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks) 612 break; 613 614 if (succ_size(CurrEntry) != 2) 615 break; 616 617 BasicBlock *Succ1 = *succ_begin(CurrEntry); 618 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1); 619 620 BasicBlock *ReturnBlock, *NonReturnBlock; 621 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 622 623 if (ReturnBlock) { 624 OutliningInfo->Entries.push_back(CurrEntry); 625 OutliningInfo->ReturnBlock = ReturnBlock; 626 OutliningInfo->NonReturnBlock = NonReturnBlock; 627 CandidateFound = true; 628 break; 629 } 630 631 BasicBlock *CommSucc, *OtherSucc; 632 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2); 633 634 if (!CommSucc) 635 break; 636 637 OutliningInfo->Entries.push_back(CurrEntry); 638 CurrEntry = OtherSucc; 639 } while (true); 640 641 if (!CandidateFound) 642 return std::unique_ptr<FunctionOutliningInfo>(); 643 644 // There should not be any successors (not in the entry set) other than 645 // {ReturnBlock, NonReturnBlock} 646 assert(OutliningInfo->Entries[0] == &F.front() && 647 "Function Entry must be the first in Entries vector"); 648 DenseSet<BasicBlock *> Entries; 649 for (BasicBlock *E : OutliningInfo->Entries) 650 Entries.insert(E); 651 652 // Returns true of BB has Predecessor which is not 653 // in Entries set. 654 auto HasNonEntryPred = [Entries](BasicBlock *BB) { 655 for (auto *Pred : predecessors(BB)) { 656 if (!Entries.count(Pred)) 657 return true; 658 } 659 return false; 660 }; 661 auto CheckAndNormalizeCandidate = 662 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) { 663 for (BasicBlock *E : OutliningInfo->Entries) { 664 for (auto *Succ : successors(E)) { 665 if (Entries.count(Succ)) 666 continue; 667 if (Succ == OutliningInfo->ReturnBlock) 668 OutliningInfo->ReturnBlockPreds.push_back(E); 669 else if (Succ != OutliningInfo->NonReturnBlock) 670 return false; 671 } 672 // There should not be any outside incoming edges either: 673 if (HasNonEntryPred(E)) 674 return false; 675 } 676 return true; 677 }; 678 679 if (!CheckAndNormalizeCandidate(OutliningInfo.get())) 680 return std::unique_ptr<FunctionOutliningInfo>(); 681 682 // Now further growing the candidate's inlining region by 683 // peeling off dominating blocks from the outlining region: 684 while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) { 685 BasicBlock *Cand = OutliningInfo->NonReturnBlock; 686 if (succ_size(Cand) != 2) 687 break; 688 689 if (HasNonEntryPred(Cand)) 690 break; 691 692 BasicBlock *Succ1 = *succ_begin(Cand); 693 BasicBlock *Succ2 = *(succ_begin(Cand) + 1); 694 695 BasicBlock *ReturnBlock, *NonReturnBlock; 696 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2); 697 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock) 698 break; 699 700 if (NonReturnBlock->getSinglePredecessor() != Cand) 701 break; 702 703 // Now grow and update OutlininigInfo: 704 OutliningInfo->Entries.push_back(Cand); 705 OutliningInfo->NonReturnBlock = NonReturnBlock; 706 OutliningInfo->ReturnBlockPreds.push_back(Cand); 707 Entries.insert(Cand); 708 } 709 710 return OutliningInfo; 711 } 712 713 // Check if there is PGO data or user annotated branch data: 714 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) { 715 if (F.hasProfileData()) 716 return true; 717 // Now check if any of the entry block has MD_prof data: 718 for (auto *E : OI.Entries) { 719 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator()); 720 if (!BR || BR->isUnconditional()) 721 continue; 722 uint64_t T, F; 723 if (BR->extractProfMetadata(T, F)) 724 return true; 725 } 726 return false; 727 } 728 729 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( 730 FunctionCloner &Cloner) const { 731 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second; 732 auto EntryFreq = 733 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock()); 734 auto OutliningCallFreq = 735 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB); 736 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE 737 // we outlined any regions, so we may encounter situations where the 738 // OutliningCallFreq is *slightly* bigger than the EntryFreq. 739 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) 740 OutliningCallFreq = EntryFreq; 741 742 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability( 743 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency()); 744 745 if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get())) 746 return OutlineRegionRelFreq; 747 748 // When profile data is not available, we need to be conservative in 749 // estimating the overall savings. Static branch prediction can usually 750 // guess the branch direction right (taken/non-taken), but the guessed 751 // branch probability is usually not biased enough. In case when the 752 // outlined region is predicted to be likely, its probability needs 753 // to be made higher (more biased) to not under-estimate the cost of 754 // function outlining. On the other hand, if the outlined region 755 // is predicted to be less likely, the predicted probablity is usually 756 // higher than the actual. For instance, the actual probability of the 757 // less likely target is only 5%, but the guessed probablity can be 758 // 40%. In the latter case, there is no need for further adjustement. 759 // FIXME: add an option for this. 760 if (OutlineRegionRelFreq < BranchProbability(45, 100)) 761 return OutlineRegionRelFreq; 762 763 OutlineRegionRelFreq = std::max( 764 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); 765 766 return OutlineRegionRelFreq; 767 } 768 769 bool PartialInlinerImpl::shouldPartialInline( 770 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost, 771 OptimizationRemarkEmitter &ORE) const { 772 using namespace ore; 773 774 Function *Callee = CB.getCalledFunction(); 775 assert(Callee == Cloner.ClonedFunc); 776 777 if (SkipCostAnalysis) 778 return isInlineViable(*Callee).isSuccess(); 779 780 Function *Caller = CB.getCaller(); 781 auto &CalleeTTI = GetTTI(*Callee); 782 bool RemarksEnabled = 783 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled( 784 DEBUG_TYPE); 785 InlineCost IC = 786 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache, 787 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr); 788 789 if (IC.isAlways()) { 790 ORE.emit([&]() { 791 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB) 792 << NV("Callee", Cloner.OrigFunc) 793 << " should always be fully inlined, not partially"; 794 }); 795 return false; 796 } 797 798 if (IC.isNever()) { 799 ORE.emit([&]() { 800 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB) 801 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 802 << NV("Caller", Caller) 803 << " because it should never be inlined (cost=never)"; 804 }); 805 return false; 806 } 807 808 if (!IC) { 809 ORE.emit([&]() { 810 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB) 811 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 812 << NV("Caller", Caller) << " because too costly to inline (cost=" 813 << NV("Cost", IC.getCost()) << ", threshold=" 814 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 815 }); 816 return false; 817 } 818 const DataLayout &DL = Caller->getParent()->getDataLayout(); 819 820 // The savings of eliminating the call: 821 int NonWeightedSavings = getCallsiteCost(CB, DL); 822 BlockFrequency NormWeightedSavings(NonWeightedSavings); 823 824 // Weighted saving is smaller than weighted cost, return false 825 if (NormWeightedSavings < WeightedOutliningRcost) { 826 ORE.emit([&]() { 827 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", 828 &CB) 829 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into " 830 << NV("Caller", Caller) << " runtime overhead (overhead=" 831 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency()) 832 << ", savings=" 833 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) 834 << ")" 835 << " of making the outlined call is too high"; 836 }); 837 838 return false; 839 } 840 841 ORE.emit([&]() { 842 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB) 843 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into " 844 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 845 << " (threshold=" 846 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 847 }); 848 return true; 849 } 850 851 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 852 // For now use a simplified version. The returned 'InlineCost' will be used 853 // to esimate the size cost as well as runtime cost of the BB. 854 InstructionCost 855 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, 856 TargetTransformInfo *TTI) { 857 InstructionCost InlineCost = 0; 858 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 859 for (Instruction &I : BB->instructionsWithoutDebug()) { 860 // Skip free instructions. 861 switch (I.getOpcode()) { 862 case Instruction::BitCast: 863 case Instruction::PtrToInt: 864 case Instruction::IntToPtr: 865 case Instruction::Alloca: 866 case Instruction::PHI: 867 continue; 868 case Instruction::GetElementPtr: 869 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices()) 870 continue; 871 break; 872 default: 873 break; 874 } 875 876 if (I.isLifetimeStartOrEnd()) 877 continue; 878 879 if (auto *II = dyn_cast<IntrinsicInst>(&I)) { 880 Intrinsic::ID IID = II->getIntrinsicID(); 881 SmallVector<Type *, 4> Tys; 882 FastMathFlags FMF; 883 for (Value *Val : II->args()) 884 Tys.push_back(Val->getType()); 885 886 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 887 FMF = FPMO->getFastMathFlags(); 888 889 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); 890 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); 891 continue; 892 } 893 894 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 895 InlineCost += getCallsiteCost(*CI, DL); 896 continue; 897 } 898 899 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { 900 InlineCost += getCallsiteCost(*II, DL); 901 continue; 902 } 903 904 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) { 905 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 906 continue; 907 } 908 InlineCost += InlineConstants::InstrCost; 909 } 910 911 return InlineCost; 912 } 913 914 std::tuple<InstructionCost, InstructionCost> 915 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const { 916 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 917 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 918 Function *OutlinedFunc = FuncBBPair.first; 919 BasicBlock* OutliningCallBB = FuncBBPair.second; 920 // Now compute the cost of the call sequence to the outlined function 921 // 'OutlinedFunction' in BB 'OutliningCallBB': 922 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 923 OutliningFuncCallCost += 924 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 925 926 // Now compute the cost of the extracted/outlined function itself: 927 for (BasicBlock &BB : *OutlinedFunc) 928 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 929 } 930 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 931 "Outlined function cost should be no less than the outlined region"); 932 933 // The code extractor introduces a new root and exit stub blocks with 934 // additional unconditional branches. Those branches will be eliminated 935 // later with bb layout. The cost should be adjusted accordingly: 936 OutlinedFunctionCost -= 937 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 938 939 InstructionCost OutliningRuntimeOverhead = 940 OutliningFuncCallCost + 941 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 942 ExtraOutliningPenalty.getValue(); 943 944 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 945 } 946 947 // Create the callsite to profile count map which is 948 // used to update the original function's entry count, 949 // after the function is partially inlined into the callsite. 950 void PartialInlinerImpl::computeCallsiteToProfCountMap( 951 Function *DuplicateFunction, 952 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const { 953 std::vector<User *> Users(DuplicateFunction->user_begin(), 954 DuplicateFunction->user_end()); 955 Function *CurrentCaller = nullptr; 956 std::unique_ptr<BlockFrequencyInfo> TempBFI; 957 BlockFrequencyInfo *CurrentCallerBFI = nullptr; 958 959 auto ComputeCurrBFI = [&,this](Function *Caller) { 960 // For the old pass manager: 961 if (!GetBFI) { 962 DominatorTree DT(*Caller); 963 LoopInfo LI(DT); 964 BranchProbabilityInfo BPI(*Caller, LI); 965 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); 966 CurrentCallerBFI = TempBFI.get(); 967 } else { 968 // New pass manager: 969 CurrentCallerBFI = &(GetBFI(*Caller)); 970 } 971 }; 972 973 for (User *User : Users) { 974 // Don't bother with BlockAddress used by CallBr for asm goto. 975 if (isa<BlockAddress>(User)) 976 continue; 977 CallBase *CB = getSupportedCallBase(User); 978 Function *Caller = CB->getCaller(); 979 if (CurrentCaller != Caller) { 980 CurrentCaller = Caller; 981 ComputeCurrBFI(Caller); 982 } else { 983 assert(CurrentCallerBFI && "CallerBFI is not set"); 984 } 985 BasicBlock *CallBB = CB->getParent(); 986 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB); 987 if (Count) 988 CallSiteToProfCountMap[User] = *Count; 989 else 990 CallSiteToProfCountMap[User] = 0; 991 } 992 } 993 994 PartialInlinerImpl::FunctionCloner::FunctionCloner( 995 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE, 996 function_ref<AssumptionCache *(Function &)> LookupAC, 997 function_ref<TargetTransformInfo &(Function &)> GetTTI) 998 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 999 ClonedOI = std::make_unique<FunctionOutliningInfo>(); 1000 1001 // Clone the function, so that we can hack away on it. 1002 ValueToValueMapTy VMap; 1003 ClonedFunc = CloneFunction(F, VMap); 1004 1005 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]); 1006 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]); 1007 for (BasicBlock *BB : OI->Entries) 1008 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB])); 1009 1010 for (BasicBlock *E : OI->ReturnBlockPreds) { 1011 BasicBlock *NewE = cast<BasicBlock>(VMap[E]); 1012 ClonedOI->ReturnBlockPreds.push_back(NewE); 1013 } 1014 // Go ahead and update all uses to the duplicate, so that we can just 1015 // use the inliner functionality when we're done hacking. 1016 F->replaceAllUsesWith(ClonedFunc); 1017 } 1018 1019 PartialInlinerImpl::FunctionCloner::FunctionCloner( 1020 Function *F, FunctionOutliningMultiRegionInfo *OI, 1021 OptimizationRemarkEmitter &ORE, 1022 function_ref<AssumptionCache *(Function &)> LookupAC, 1023 function_ref<TargetTransformInfo &(Function &)> GetTTI) 1024 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) { 1025 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>(); 1026 1027 // Clone the function, so that we can hack away on it. 1028 ValueToValueMapTy VMap; 1029 ClonedFunc = CloneFunction(F, VMap); 1030 1031 // Go through all Outline Candidate Regions and update all BasicBlock 1032 // information. 1033 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1034 OI->ORI) { 1035 SmallVector<BasicBlock *, 8> Region; 1036 for (BasicBlock *BB : RegionInfo.Region) 1037 Region.push_back(cast<BasicBlock>(VMap[BB])); 1038 1039 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]); 1040 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]); 1041 BasicBlock *NewReturnBlock = nullptr; 1042 if (RegionInfo.ReturnBlock) 1043 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]); 1044 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo( 1045 Region, NewEntryBlock, NewExitBlock, NewReturnBlock); 1046 ClonedOMRI->ORI.push_back(MappedRegionInfo); 1047 } 1048 // Go ahead and update all uses to the duplicate, so that we can just 1049 // use the inliner functionality when we're done hacking. 1050 F->replaceAllUsesWith(ClonedFunc); 1051 } 1052 1053 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { 1054 auto GetFirstPHI = [](BasicBlock *BB) { 1055 BasicBlock::iterator I = BB->begin(); 1056 PHINode *FirstPhi = nullptr; 1057 while (I != BB->end()) { 1058 PHINode *Phi = dyn_cast<PHINode>(I); 1059 if (!Phi) 1060 break; 1061 if (!FirstPhi) { 1062 FirstPhi = Phi; 1063 break; 1064 } 1065 } 1066 return FirstPhi; 1067 }; 1068 1069 // Shouldn't need to normalize PHIs if we're not outlining non-early return 1070 // blocks. 1071 if (!ClonedOI) 1072 return; 1073 1074 // Special hackery is needed with PHI nodes that have inputs from more than 1075 // one extracted block. For simplicity, just split the PHIs into a two-level 1076 // sequence of PHIs, some of which will go in the extracted region, and some 1077 // of which will go outside. 1078 BasicBlock *PreReturn = ClonedOI->ReturnBlock; 1079 // only split block when necessary: 1080 PHINode *FirstPhi = GetFirstPHI(PreReturn); 1081 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size(); 1082 1083 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1) 1084 return; 1085 1086 auto IsTrivialPhi = [](PHINode *PN) -> Value * { 1087 Value *CommonValue = PN->getIncomingValue(0); 1088 if (all_of(PN->incoming_values(), 1089 [&](Value *V) { return V == CommonValue; })) 1090 return CommonValue; 1091 return nullptr; 1092 }; 1093 1094 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( 1095 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); 1096 BasicBlock::iterator I = PreReturn->begin(); 1097 Instruction *Ins = &ClonedOI->ReturnBlock->front(); 1098 SmallVector<Instruction *, 4> DeadPhis; 1099 while (I != PreReturn->end()) { 1100 PHINode *OldPhi = dyn_cast<PHINode>(I); 1101 if (!OldPhi) 1102 break; 1103 1104 PHINode *RetPhi = 1105 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins); 1106 OldPhi->replaceAllUsesWith(RetPhi); 1107 Ins = ClonedOI->ReturnBlock->getFirstNonPHI(); 1108 1109 RetPhi->addIncoming(&*I, PreReturn); 1110 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) { 1111 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E); 1112 OldPhi->removeIncomingValue(E); 1113 } 1114 1115 // After incoming values splitting, the old phi may become trivial. 1116 // Keeping the trivial phi can introduce definition inside the outline 1117 // region which is live-out, causing necessary overhead (load, store 1118 // arg passing etc). 1119 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) { 1120 OldPhi->replaceAllUsesWith(OldPhiVal); 1121 DeadPhis.push_back(OldPhi); 1122 } 1123 ++I; 1124 } 1125 for (auto *DP : DeadPhis) 1126 DP->eraseFromParent(); 1127 1128 for (auto *E : ClonedOI->ReturnBlockPreds) 1129 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock); 1130 } 1131 1132 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() { 1133 1134 auto ComputeRegionCost = 1135 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost { 1136 InstructionCost Cost = 0; 1137 for (BasicBlock* BB : Region) 1138 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 1139 return Cost; 1140 }; 1141 1142 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline"); 1143 1144 if (ClonedOMRI->ORI.empty()) 1145 return false; 1146 1147 // The CodeExtractor needs a dominator tree. 1148 DominatorTree DT; 1149 DT.recalculate(*ClonedFunc); 1150 1151 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1152 LoopInfo LI(DT); 1153 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1154 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1155 1156 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time. 1157 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1158 1159 SetVector<Value *> Inputs, Outputs, Sinks; 1160 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo : 1161 ClonedOMRI->ORI) { 1162 InstructionCost CurrentOutlinedRegionCost = 1163 ComputeRegionCost(RegionInfo.Region); 1164 1165 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false, 1166 ClonedFuncBFI.get(), &BPI, 1167 LookupAC(*RegionInfo.EntryBlock->getParent()), 1168 /* AllowVarargs */ false); 1169 1170 CE.findInputsOutputs(Inputs, Outputs, Sinks); 1171 1172 LLVM_DEBUG({ 1173 dbgs() << "inputs: " << Inputs.size() << "\n"; 1174 dbgs() << "outputs: " << Outputs.size() << "\n"; 1175 for (Value *value : Inputs) 1176 dbgs() << "value used in func: " << *value << "\n"; 1177 for (Value *output : Outputs) 1178 dbgs() << "instr used in func: " << *output << "\n"; 1179 }); 1180 1181 // Do not extract regions that have live exit variables. 1182 if (Outputs.size() > 0 && !ForceLiveExit) 1183 continue; 1184 1185 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) { 1186 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc); 1187 BasicBlock *OutliningCallBB = OCS->getParent(); 1188 assert(OutliningCallBB->getParent() == ClonedFunc); 1189 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB)); 1190 NumColdRegionsOutlined++; 1191 OutlinedRegionCost += CurrentOutlinedRegionCost; 1192 1193 if (MarkOutlinedColdCC) { 1194 OutlinedFunc->setCallingConv(CallingConv::Cold); 1195 OCS->setCallingConv(CallingConv::Cold); 1196 } 1197 } else 1198 ORE.emit([&]() { 1199 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1200 &RegionInfo.Region.front()->front()) 1201 << "Failed to extract region at block " 1202 << ore::NV("Block", RegionInfo.Region.front()); 1203 }); 1204 } 1205 1206 return !OutlinedFunctions.empty(); 1207 } 1208 1209 Function * 1210 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() { 1211 // Returns true if the block is to be partial inlined into the caller 1212 // (i.e. not to be extracted to the out of line function) 1213 auto ToBeInlined = [&, this](BasicBlock *BB) { 1214 return BB == ClonedOI->ReturnBlock || 1215 llvm::is_contained(ClonedOI->Entries, BB); 1216 }; 1217 1218 assert(ClonedOI && "Expecting OutlineInfo for single region outline"); 1219 // The CodeExtractor needs a dominator tree. 1220 DominatorTree DT; 1221 DT.recalculate(*ClonedFunc); 1222 1223 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo. 1224 LoopInfo LI(DT); 1225 BranchProbabilityInfo BPI(*ClonedFunc, LI); 1226 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI)); 1227 1228 // Gather up the blocks that we're going to extract. 1229 std::vector<BasicBlock *> ToExtract; 1230 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc); 1231 ToExtract.push_back(ClonedOI->NonReturnBlock); 1232 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost( 1233 ClonedOI->NonReturnBlock, ClonedFuncTTI); 1234 for (BasicBlock &BB : *ClonedFunc) 1235 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) { 1236 ToExtract.push_back(&BB); 1237 // FIXME: the code extractor may hoist/sink more code 1238 // into the outlined function which may make the outlining 1239 // overhead (the difference of the outlined function cost 1240 // and OutliningRegionCost) look larger. 1241 OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI); 1242 } 1243 1244 // Extract the body of the if. 1245 CodeExtractorAnalysisCache CEAC(*ClonedFunc); 1246 Function *OutlinedFunc = 1247 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, 1248 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc), 1249 /* AllowVarargs */ true) 1250 .extractCodeRegion(CEAC); 1251 1252 if (OutlinedFunc) { 1253 BasicBlock *OutliningCallBB = 1254 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent(); 1255 assert(OutliningCallBB->getParent() == ClonedFunc); 1256 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB)); 1257 } else 1258 ORE.emit([&]() { 1259 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed", 1260 &ToExtract.front()->front()) 1261 << "Failed to extract region at block " 1262 << ore::NV("Block", ToExtract.front()); 1263 }); 1264 1265 return OutlinedFunc; 1266 } 1267 1268 PartialInlinerImpl::FunctionCloner::~FunctionCloner() { 1269 // Ditch the duplicate, since we're done with it, and rewrite all remaining 1270 // users (function pointers, etc.) back to the original function. 1271 ClonedFunc->replaceAllUsesWith(OrigFunc); 1272 ClonedFunc->eraseFromParent(); 1273 if (!IsFunctionInlined) { 1274 // Remove each function that was speculatively created if there is no 1275 // reference. 1276 for (auto FuncBBPair : OutlinedFunctions) { 1277 Function *Func = FuncBBPair.first; 1278 Func->eraseFromParent(); 1279 } 1280 } 1281 } 1282 1283 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) { 1284 if (F.hasAddressTaken()) 1285 return {false, nullptr}; 1286 1287 // Let inliner handle it 1288 if (F.hasFnAttribute(Attribute::AlwaysInline)) 1289 return {false, nullptr}; 1290 1291 if (F.hasFnAttribute(Attribute::NoInline)) 1292 return {false, nullptr}; 1293 1294 if (PSI.isFunctionEntryCold(&F)) 1295 return {false, nullptr}; 1296 1297 if (F.users().empty()) 1298 return {false, nullptr}; 1299 1300 OptimizationRemarkEmitter ORE(&F); 1301 1302 // Only try to outline cold regions if we have a profile summary, which 1303 // implies we have profiling information. 1304 if (PSI.hasProfileSummary() && F.hasProfileData() && 1305 !DisableMultiRegionPartialInline) { 1306 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI = 1307 computeOutliningColdRegionsInfo(F, ORE); 1308 if (OMRI) { 1309 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI); 1310 1311 LLVM_DEBUG({ 1312 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n"; 1313 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold() 1314 << "\n"; 1315 }); 1316 1317 bool DidOutline = Cloner.doMultiRegionFunctionOutlining(); 1318 1319 if (DidOutline) { 1320 LLVM_DEBUG({ 1321 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n"; 1322 Cloner.ClonedFunc->print(dbgs()); 1323 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n"; 1324 }); 1325 1326 if (tryPartialInline(Cloner)) 1327 return {true, nullptr}; 1328 } 1329 } 1330 } 1331 1332 // Fall-thru to regular partial inlining if we: 1333 // i) can't find any cold regions to outline, or 1334 // ii) can't inline the outlined function anywhere. 1335 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F); 1336 if (!OI) 1337 return {false, nullptr}; 1338 1339 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI); 1340 Cloner.normalizeReturnBlock(); 1341 1342 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining(); 1343 1344 if (!OutlinedFunction) 1345 return {false, nullptr}; 1346 1347 if (tryPartialInline(Cloner)) 1348 return {true, OutlinedFunction}; 1349 1350 return {false, nullptr}; 1351 } 1352 1353 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) { 1354 if (Cloner.OutlinedFunctions.empty()) 1355 return false; 1356 1357 int SizeCost = 0; 1358 BlockFrequency WeightedRcost; 1359 int NonWeightedRcost; 1360 1361 auto OutliningCosts = computeOutliningCosts(Cloner); 1362 assert(std::get<0>(OutliningCosts).isValid() && 1363 std::get<1>(OutliningCosts).isValid() && "Expected valid costs"); 1364 1365 SizeCost = *std::get<0>(OutliningCosts).getValue(); 1366 NonWeightedRcost = *std::get<1>(OutliningCosts).getValue(); 1367 1368 // Only calculate RelativeToEntryFreq when we are doing single region 1369 // outlining. 1370 BranchProbability RelativeToEntryFreq; 1371 if (Cloner.ClonedOI) 1372 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner); 1373 else 1374 // RelativeToEntryFreq doesn't make sense when we have more than one 1375 // outlined call because each call will have a different relative frequency 1376 // to the entry block. We can consider using the average, but the 1377 // usefulness of that information is questionable. For now, assume we never 1378 // execute the calls to outlined functions. 1379 RelativeToEntryFreq = BranchProbability(0, 1); 1380 1381 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq; 1382 1383 // The call sequence(s) to the outlined function(s) are larger than the sum of 1384 // the original outlined region size(s), it does not increase the chances of 1385 // inlining the function with outlining (The inliner uses the size increase to 1386 // model the cost of inlining a callee). 1387 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) { 1388 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1389 DebugLoc DLoc; 1390 BasicBlock *Block; 1391 std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc); 1392 OrigFuncORE.emit([&]() { 1393 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall", 1394 DLoc, Block) 1395 << ore::NV("Function", Cloner.OrigFunc) 1396 << " not partially inlined into callers (Original Size = " 1397 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost) 1398 << ", Size of call sequence to outlined function = " 1399 << ore::NV("NewSize", SizeCost) << ")"; 1400 }); 1401 return false; 1402 } 1403 1404 assert(Cloner.OrigFunc->users().empty() && 1405 "F's users should all be replaced!"); 1406 1407 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(), 1408 Cloner.ClonedFunc->user_end()); 1409 1410 DenseMap<User *, uint64_t> CallSiteToProfCountMap; 1411 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount(); 1412 if (CalleeEntryCount) 1413 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap); 1414 1415 uint64_t CalleeEntryCountV = 1416 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0); 1417 1418 bool AnyInline = false; 1419 for (User *User : Users) { 1420 // Don't bother with BlockAddress used by CallBr for asm goto. 1421 if (isa<BlockAddress>(User)) 1422 continue; 1423 1424 CallBase *CB = getSupportedCallBase(User); 1425 1426 if (isLimitReached()) 1427 continue; 1428 1429 OptimizationRemarkEmitter CallerORE(CB->getCaller()); 1430 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE)) 1431 continue; 1432 1433 // Construct remark before doing the inlining, as after successful inlining 1434 // the callsite is removed. 1435 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB); 1436 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into " 1437 << ore::NV("Caller", CB->getCaller()); 1438 1439 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI); 1440 // We can only forward varargs when we outlined a single region, else we 1441 // bail on vararg functions. 1442 if (!InlineFunction(*CB, IFI, nullptr, true, 1443 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first 1444 : nullptr)) 1445 .isSuccess()) 1446 continue; 1447 1448 CallerORE.emit(OR); 1449 1450 // Now update the entry count: 1451 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) { 1452 uint64_t CallSiteCount = CallSiteToProfCountMap[User]; 1453 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount); 1454 } 1455 1456 AnyInline = true; 1457 NumPartialInlining++; 1458 // Update the stats 1459 if (Cloner.ClonedOI) 1460 NumPartialInlined++; 1461 else 1462 NumColdOutlinePartialInlined++; 1463 } 1464 1465 if (AnyInline) { 1466 Cloner.IsFunctionInlined = true; 1467 if (CalleeEntryCount) 1468 Cloner.OrigFunc->setEntryCount(Function::ProfileCount( 1469 CalleeEntryCountV, CalleeEntryCount->getType())); 1470 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc); 1471 OrigFuncORE.emit([&]() { 1472 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc) 1473 << "Partially inlined into at least one caller"; 1474 }); 1475 } 1476 1477 return AnyInline; 1478 } 1479 1480 bool PartialInlinerImpl::run(Module &M) { 1481 if (DisablePartialInlining) 1482 return false; 1483 1484 std::vector<Function *> Worklist; 1485 Worklist.reserve(M.size()); 1486 for (Function &F : M) 1487 if (!F.use_empty() && !F.isDeclaration()) 1488 Worklist.push_back(&F); 1489 1490 bool Changed = false; 1491 while (!Worklist.empty()) { 1492 Function *CurrFunc = Worklist.back(); 1493 Worklist.pop_back(); 1494 1495 if (CurrFunc->use_empty()) 1496 continue; 1497 1498 bool Recursive = false; 1499 for (User *U : CurrFunc->users()) 1500 if (Instruction *I = dyn_cast<Instruction>(U)) 1501 if (I->getParent()->getParent() == CurrFunc) { 1502 Recursive = true; 1503 break; 1504 } 1505 if (Recursive) 1506 continue; 1507 1508 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc); 1509 if (Result.second) 1510 Worklist.push_back(Result.second); 1511 Changed |= Result.first; 1512 } 1513 1514 return Changed; 1515 } 1516 1517 char PartialInlinerLegacyPass::ID = 0; 1518 1519 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner", 1520 "Partial Inliner", false, false) 1521 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 1522 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) 1523 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 1524 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) 1525 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner", 1526 "Partial Inliner", false, false) 1527 1528 ModulePass *llvm::createPartialInliningPass() { 1529 return new PartialInlinerLegacyPass(); 1530 } 1531 1532 PreservedAnalyses PartialInlinerPass::run(Module &M, 1533 ModuleAnalysisManager &AM) { 1534 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 1535 1536 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & { 1537 return FAM.getResult<AssumptionAnalysis>(F); 1538 }; 1539 1540 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * { 1541 return FAM.getCachedResult<AssumptionAnalysis>(F); 1542 }; 1543 1544 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & { 1545 return FAM.getResult<BlockFrequencyAnalysis>(F); 1546 }; 1547 1548 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & { 1549 return FAM.getResult<TargetIRAnalysis>(F); 1550 }; 1551 1552 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & { 1553 return FAM.getResult<TargetLibraryAnalysis>(F); 1554 }; 1555 1556 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M); 1557 1558 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI, 1559 GetTLI, PSI, GetBFI) 1560 .run(M)) 1561 return PreservedAnalyses::none(); 1562 return PreservedAnalyses::all(); 1563 } 1564