1 //===- PartialInlining.cpp - Inline parts of functions --------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass performs partial inlining, typically by inlining an if statement 10 // that surrounds the body of the function. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/IPO/PartialInlining.h" 15 #include "llvm/ADT/DenseMap.h" 16 #include "llvm/ADT/DenseSet.h" 17 #include "llvm/ADT/None.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/STLExtras.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/Statistic.h" 22 #include "llvm/Analysis/BlockFrequencyInfo.h" 23 #include "llvm/Analysis/BranchProbabilityInfo.h" 24 #include "llvm/Analysis/InlineCost.h" 25 #include "llvm/Analysis/LoopInfo.h" 26 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 27 #include "llvm/Analysis/ProfileSummaryInfo.h" 28 #include "llvm/Analysis/TargetLibraryInfo.h" 29 #include "llvm/Analysis/TargetTransformInfo.h" 30 #include "llvm/IR/Attributes.h" 31 #include "llvm/IR/BasicBlock.h" 32 #include "llvm/IR/CFG.h" 33 #include "llvm/IR/DebugLoc.h" 34 #include "llvm/IR/DiagnosticInfo.h" 35 #include "llvm/IR/Dominators.h" 36 #include "llvm/IR/Function.h" 37 #include "llvm/IR/InstrTypes.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/IntrinsicInst.h" 41 #include "llvm/IR/Intrinsics.h" 42 #include "llvm/IR/Module.h" 43 #include "llvm/IR/User.h" 44 #include "llvm/InitializePasses.h" 45 #include "llvm/Pass.h" 46 #include "llvm/Support/BlockFrequency.h" 47 #include "llvm/Support/BranchProbability.h" 48 #include "llvm/Support/Casting.h" 49 #include "llvm/Support/CommandLine.h" 50 
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region (cold-region) partial-inlining.
// The default is false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining (emits dbgs() traces of
// cost computations and cold-edge discovery). The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing: it bypasses the profitability
// computation in shouldPartialInline:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function. Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

// Describes a single-region outlining candidate: the chain of guarding
// entry blocks that stay inlined, plus the return block and the dominating
// block of the region that gets outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

// Describes the cold-region (multi-region) outlining candidates of a
// function: each entry in ORI is an independent single-entry/single-exit
// region deemed cold by profile data.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo()
      : ORI() {}

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
          ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
    // All blocks of the region; EntryBlock dominates them all.
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    // The block after the outlined region (single successor of ExitBlock).
    // NOTE(review): computeOutliningColdRegionsInfo can store null here
    // (the assert there is commented out) — downstream users must tolerate it.
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

// Implementation of the partial inlining transformation, shared by the
// legacy and new pass managers. Analysis accessors are injected as
// function_refs so the same code works under both managers.
struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise. Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  // Number of successful partial inlines so far; checked against
  // MaxNumPartialInlining in IsLimitReached().
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // True once the module-wide partial-inlining budget is exhausted
  // (-1 means unlimited).
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

  // Precondition: F has at least one user, and all users are call sites.
  static CallBase *getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getSupportedCallBase(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};

// Legacy pass manager wrapper: gathers the required analyses and
// delegates everything to PartialInlinerImpl.
struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo &PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
      return ACT->lookupAssumptionCache(F);
    };

    auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
      return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
    };

    return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
                              GetTLI, PSI)
        .run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
OptimizationRemarkEmitter &ORE) { 399 BasicBlock *EntryBlock = &F->front(); 400 401 DominatorTree DT(*F); 402 LoopInfo LI(DT); 403 BranchProbabilityInfo BPI(*F, LI); 404 std::unique_ptr<BlockFrequencyInfo> ScopedBFI; 405 BlockFrequencyInfo *BFI; 406 if (!GetBFI) { 407 ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI)); 408 BFI = ScopedBFI.get(); 409 } else 410 BFI = &(GetBFI(*F)); 411 412 // Return if we don't have profiling information. 413 if (!PSI.hasInstrumentationProfile()) 414 return std::unique_ptr<FunctionOutliningMultiRegionInfo>(); 415 416 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo = 417 std::make_unique<FunctionOutliningMultiRegionInfo>(); 418 419 auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) { 420 BasicBlock *Dom = BlockList.front(); 421 return BlockList.size() > 1 && Dom->hasNPredecessors(1); 422 }; 423 424 auto IsSingleExit = 425 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * { 426 BasicBlock *ExitBlock = nullptr; 427 for (auto *Block : BlockList) { 428 for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) { 429 if (!is_contained(BlockList, *SI)) { 430 if (ExitBlock) { 431 ORE.emit([&]() { 432 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion", 433 &SI->front()) 434 << "Region dominated by " 435 << ore::NV("Block", BlockList.front()->getName()) 436 << " has more than one region exit edge."; 437 }); 438 return nullptr; 439 } else 440 ExitBlock = Block; 441 } 442 } 443 } 444 return ExitBlock; 445 }; 446 447 auto BBProfileCount = [BFI](BasicBlock *BB) { 448 return BFI->getBlockProfileCount(BB) 449 ? BFI->getBlockProfileCount(BB).getValue() 450 : 0; 451 }; 452 453 // Use the same computeBBInlineCost function to compute the cost savings of 454 // the outlining the candidate region. 
455 TargetTransformInfo *FTTI = &GetTTI(*F); 456 int OverallFunctionCost = 0; 457 for (auto &BB : *F) 458 OverallFunctionCost += computeBBInlineCost(&BB, FTTI); 459 460 #ifndef NDEBUG 461 if (TracePartialInlining) 462 dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n"; 463 #endif 464 int MinOutlineRegionCost = 465 static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); 466 BranchProbability MinBranchProbability( 467 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution), 468 MinBlockCounterExecution); 469 bool ColdCandidateFound = false; 470 BasicBlock *CurrEntry = EntryBlock; 471 std::vector<BasicBlock *> DFS; 472 DenseMap<BasicBlock *, bool> VisitedMap; 473 DFS.push_back(CurrEntry); 474 VisitedMap[CurrEntry] = true; 475 // Use Depth First Search on the basic blocks to find CFG edges that are 476 // considered cold. 477 // Cold regions considered must also have its inline cost compared to the 478 // overall inline cost of the original function. The region is outlined only 479 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or 480 // more. 481 while (!DFS.empty()) { 482 auto *thisBB = DFS.back(); 483 DFS.pop_back(); 484 // Only consider regions with predecessor blocks that are considered 485 // not-cold (default: part of the top 99.99% of all block counters) 486 // AND greater than our minimum block execution count (default: 100). 487 if (PSI.isColdBlock(thisBB, BFI) || 488 BBProfileCount(thisBB) < MinBlockCounterExecution) 489 continue; 490 for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) { 491 if (VisitedMap[*SI]) 492 continue; 493 VisitedMap[*SI] = true; 494 DFS.push_back(*SI); 495 // If branch isn't cold, we skip to the next one. 
496 BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI); 497 if (SuccProb > MinBranchProbability) 498 continue; 499 #ifndef NDEBUG 500 if (TracePartialInlining) { 501 dbgs() << "Found cold edge: " << thisBB->getName() << "->" 502 << (*SI)->getName() << "\nBranch Probability = " << SuccProb 503 << "\n"; 504 } 505 #endif 506 SmallVector<BasicBlock *, 8> DominateVector; 507 DT.getDescendants(*SI, DominateVector); 508 // We can only outline single entry regions (for now). 509 if (!IsSingleEntry(DominateVector)) 510 continue; 511 BasicBlock *ExitBlock = nullptr; 512 // We can only outline single exit regions (for now). 513 if (!(ExitBlock = IsSingleExit(DominateVector))) 514 continue; 515 int OutlineRegionCost = 0; 516 for (auto *BB : DominateVector) 517 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent())); 518 519 #ifndef NDEBUG 520 if (TracePartialInlining) 521 dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n"; 522 #endif 523 524 if (OutlineRegionCost < MinOutlineRegionCost) { 525 ORE.emit([&]() { 526 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", 527 &SI->front()) 528 << ore::NV("Callee", F) << " inline cost-savings smaller than " 529 << ore::NV("Cost", MinOutlineRegionCost); 530 }); 531 continue; 532 } 533 // For now, ignore blocks that belong to a SISE region that is a 534 // candidate for outlining. In the future, we may want to look 535 // at inner regions because the outer region may have live-exit 536 // variables. 
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // NOTE(review): ReturnBlock may be null when ExitBlock has multiple
      // successors; the assert below is disabled, so a null ReturnBlock is
      // stored into the region info — downstream code must tolerate it.
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }
  // Only hand back the info object when at least one region qualified.
  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  // Candidate functions start with a conditional branch in the entry block:
  // the shape being matched is a chain of guarding blocks ending in an
  // early-return triangle.
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the two successors as (return block, non-return block), or
  // (null, null) when neither returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk down the chain of two-successor guard blocks until a block with a
  // returning successor is found (candidate) or the shape breaks.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has Predecessor which is not
  // in Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the candidate: every entry block may only branch to other
  // entries, the return block, or the non-return block, and may not be
  // reached from outside the entry set. Also records which entries are
  // predecessors of the return block.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    // Peeling is only valid when Cand branches to the same return block and
    // is the sole predecessor of the new region head.
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->hasProfileData())
    return true;
  // Now check if any of the entry block has MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    // NOTE: 'F' below intentionally shadows the Function* parameter; it
    // receives the false-branch weight from the metadata.
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Testing hook: bypass the cost model entirely.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  // An always-inline callee should be fully inlined by the regular inliner
  // instead of being split up here.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
841 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost()) 842 << " (threshold=" 843 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"; 844 }); 845 return true; 846 } 847 848 // TODO: Ideally we should share Inliner's InlineCost Analysis code. 849 // For now use a simplified version. The returned 'InlineCost' will be used 850 // to esimate the size cost as well as runtime cost of the BB. 851 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB, 852 TargetTransformInfo *TTI) { 853 int InlineCost = 0; 854 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout(); 855 for (Instruction &I : BB->instructionsWithoutDebug()) { 856 // Skip free instructions. 857 switch (I.getOpcode()) { 858 case Instruction::BitCast: 859 case Instruction::PtrToInt: 860 case Instruction::IntToPtr: 861 case Instruction::Alloca: 862 case Instruction::PHI: 863 continue; 864 case Instruction::GetElementPtr: 865 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices()) 866 continue; 867 break; 868 default: 869 break; 870 } 871 872 if (I.isLifetimeStartOrEnd()) 873 continue; 874 875 if (auto *II = dyn_cast<IntrinsicInst>(&I)) { 876 Intrinsic::ID IID = II->getIntrinsicID(); 877 SmallVector<Type *, 4> Tys; 878 FastMathFlags FMF; 879 for (Value *Val : II->args()) 880 Tys.push_back(Val->getType()); 881 882 if (auto *FPMO = dyn_cast<FPMathOperator>(II)) 883 FMF = FPMO->getFastMathFlags(); 884 885 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF); 886 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency); 887 continue; 888 } 889 890 if (CallInst *CI = dyn_cast<CallInst>(&I)) { 891 InlineCost += getCallsiteCost(*CI, DL); 892 continue; 893 } 894 895 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) { 896 InlineCost += getCallsiteCost(*II, DL); 897 continue; 898 } 899 900 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) { 901 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost; 902 continue; 903 } 904 InlineCost += 
InlineConstants::InstrCost; 905 } 906 return InlineCost; 907 } 908 909 std::tuple<int, int> 910 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) { 911 int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0; 912 for (auto FuncBBPair : Cloner.OutlinedFunctions) { 913 Function *OutlinedFunc = FuncBBPair.first; 914 BasicBlock* OutliningCallBB = FuncBBPair.second; 915 // Now compute the cost of the call sequence to the outlined function 916 // 'OutlinedFunction' in BB 'OutliningCallBB': 917 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc); 918 OutliningFuncCallCost += 919 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI); 920 921 // Now compute the cost of the extracted/outlined function itself: 922 for (BasicBlock &BB : *OutlinedFunc) 923 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI); 924 } 925 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost && 926 "Outlined function cost should be no less than the outlined region"); 927 928 // The code extractor introduces a new root and exit stub blocks with 929 // additional unconditional branches. Those branches will be eliminated 930 // later with bb layout. The cost should be adjusted accordingly: 931 OutlinedFunctionCost -= 932 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size(); 933 934 int OutliningRuntimeOverhead = 935 OutliningFuncCallCost + 936 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) + 937 ExtraOutliningPenalty; 938 939 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead); 940 } 941 942 // Create the callsite to profile count map which is 943 // used to update the original function's entry count, 944 // after the function is partially inlined into the callsite. 
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  // Obtain a BlockFrequencyInfo for Caller, caching the most recent result in
  // CurrentCallerBFI (and, for the legacy pass manager, owning it in TempBFI).
  auto ComputeCurrBFI = [&,this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(GetBFI(*Caller));
    }
  };

  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);
    Function *Caller = CB->getCaller();
    // Only recompute BFI when the caller changes from the previous iteration.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CB->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    // Call sites without a profile count are recorded as 0.
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

// Clone \p F for single-region outlining and remap the outlining info \p OI
// (return block, non-return block, entries, return-block predecessors) into
// the clone via the value map.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOI = std::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Remap every block referenced by the outlining info to its clone.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Clone \p F for multi-region (cold-region) outlining and remap every
// candidate region described by \p OI into the clone.
PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE,
    function_ref<AssumptionCache *(Function &)> LookupAC,
    function_ref<TargetTransformInfo &(Function &)> GetTTI)
    : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
  ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region) {
      Region.push_back(cast<BasicBlock>(VMap[BB]));
    }
    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    // ReturnBlock is optional in a region description.
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

// Prepare the cloned function's return block for single-region extraction:
// when its PHIs merge values from more than the entry predecessors plus one
// fall-through edge, split them into a two-level PHI sequence so that one
// level can live inside the extracted region and the other outside.
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Return the leading PHI node of \p BB, or null if the block does not start
  // with a PHI. NOTE(review): every path through the loop body breaks on the
  // first iteration (there is no ++I), so only BB->front() is ever examined;
  // that is sufficient here since PHIs can only appear at the block head.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block. For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // A PHI is trivial when all incoming values are identical; return that
  // common value, or null otherwise.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split off everything after the PHIs into a new return block; PreReturn
  // keeps the old PHIs and falls through to the new block.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Second-level PHI in the new return block: merges the old PHI's value
    // coming through PreReturn with the values coming directly from the
    // entry predecessors.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    // Move the entry-predecessor edges from the old PHI up to the new one.
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Retarget the entry predecessors' branches at the new return block.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

// Outline each candidate cold region of the cloned function using
// CodeExtractor. Returns true if at least one region was outlined.
bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  // Sum the inline cost of all blocks in \p Region.
  auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
    int Cost = 0;
    for (BasicBlock* BB : Region)
      Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);

  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    // Remember the region's cost before extraction mutates the blocks.
    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);

    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI,
                     LookupAC(*RegionInfo.EntryBlock->getParent()),
                     /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

#ifndef NDEBUG
    if (TracePartialInlining) {
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    }
#endif
    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    Function *OutlinedFunc = CE.extractCodeRegion(CEAC);

    if (OutlinedFunc) {
      // Record the (outlined function, call block) pair and its region cost.
      CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS->setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

// Outline everything except the entry block(s) and the return block of the
// clone into a single new function. Returns the outlined function, or null
// if extraction failed.
Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           llvm::is_contained(ClonedOI->Entries, BB);
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
      ClonedOI->NonReturnBlock, ClonedFuncTTI);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
    }

  // Extract the body of the if.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion(CEAC);

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

// Try to partially inline \p F. Returns {Changed, NewOutlinedFunction};
// the second member is non-null only for single-region outlining, so the
// caller can enqueue the new outlined function for further processing.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // The cloner redirects all uses of F to the clone; skip functions whose
  // address is taken.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  // Partially inlining a cold function is not worth the code growth.
  if (PSI.isFunctionEntryCold(F))
    return {false, nullptr};

  // Nothing to inline into.
  if (F->users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}

// Inline the cloned function (with its region(s) outlined) into as many of
// its call sites as the cost model allows. Returns true if at least one call
// site was inlined; on success the original function's entry count is
// adjusted by the profile counts of the inlined call sites.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block. We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users list: inlining mutates the use list as we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      // Saturating subtraction so the entry count never underflows.
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    // Per-run counter checked by IsLimitReached().
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}

// Pass driver: collect candidate functions (defined and used) and attempt to
// partially inline each. Newly created outlined functions are pushed back
// onto the worklist so they can be considered as well.
bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    // Skip functions that are used from within their own body (direct
    // recursion).
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
    // A freshly outlined function is itself a partial-inlining candidate.
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

// New pass manager entry point: wire the analysis getters to the
// FunctionAnalysisManager and run the shared implementation.
PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
    return FAM.getCachedResult<AssumptionAnalysis>(F);
  };

  auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
    return FAM.getResult<TargetLibraryAnalysis>(F);
  };

  ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
                         GetTLI, PSI, GetBFI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}