1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63 
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "partial-inlining"
67 
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region (cold-region) partial-inlining.
// The default is false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
               cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining (extra dbgs() tracing).
// The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing: skip the profitability analysis and
// attempt partial inlining whenever it is structurally possible.
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function.  Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Cap on the number of blocks kept on the inlined (hot) path; 0 or 1
// effectively disables partial inlining (see computeOutliningInfo).
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
154 
155 namespace {
156 
// Describes a single-region outlining candidate: a chain of "entry" blocks
// that stay inlined and guard the region to be outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
177 
178 struct FunctionOutliningMultiRegionInfo {
179   FunctionOutliningMultiRegionInfo()
180       : ORI() {}
181 
182   // Container for outline regions
183   struct OutlineRegionInfo {
184     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
185                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
186                       BasicBlock *ReturnBlock)
187         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
188           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
189     SmallVector<BasicBlock *, 8> Region;
190     BasicBlock *EntryBlock;
191     BasicBlock *ExitBlock;
192     BasicBlock *ReturnBlock;
193   };
194 
195   SmallVector<OutlineRegionInfo, 4> ORI;
196 };
197 
198 struct PartialInlinerImpl {
199 
200   PartialInlinerImpl(
201       function_ref<AssumptionCache &(Function &)> GetAC,
202       function_ref<AssumptionCache *(Function &)> LookupAC,
203       function_ref<TargetTransformInfo &(Function &)> GTTI,
204       function_ref<const TargetLibraryInfo &(Function &)> GTLI,
205       ProfileSummaryInfo &ProfSI,
206       function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
207       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
208         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
209 
210   bool run(Module &M);
211   // Main part of the transformation that calls helper functions to find
212   // outlining candidates, clone & outline the function, and attempt to
213   // partially inline the resulting function. Returns true if
214   // inlining was successful, false otherwise.  Also returns the outline
215   // function (only if we partially inlined early returns) as there is a
216   // possibility to further "peel" early return statements that were left in the
217   // outline function due to code size.
218   std::pair<bool, Function *> unswitchFunction(Function *F);
219 
220   // This class speculatively clones the function to be partial inlined.
221   // At the end of partial inlining, the remaining callsites to the cloned
222   // function that are not partially inlined will be fixed up to reference
223   // the original function, and the cloned function will be erased.
224   struct FunctionCloner {
225     // Two constructors, one for single region outlining, the other for
226     // multi-region outlining.
227     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
228                    OptimizationRemarkEmitter &ORE,
229                    function_ref<AssumptionCache *(Function &)> LookupAC,
230                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
231     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
232                    OptimizationRemarkEmitter &ORE,
233                    function_ref<AssumptionCache *(Function &)> LookupAC,
234                    function_ref<TargetTransformInfo &(Function &)> GetTTI);
235 
236     ~FunctionCloner();
237 
238     // Prepare for function outlining: making sure there is only
239     // one incoming edge from the extracted/outlined region to
240     // the return block.
241     void NormalizeReturnBlock();
242 
243     // Do function outlining for cold regions.
244     bool doMultiRegionFunctionOutlining();
245     // Do function outlining for region after early return block(s).
246     // NOTE: For vararg functions that do the vararg handling in the outlined
247     //       function, we temporarily generate IR that does not properly
248     //       forward varargs to the outlined function. Calling InlineFunction
249     //       will update calls to the outlined functions to properly forward
250     //       the varargs.
251     Function *doSingleRegionFunctionOutlining();
252 
253     Function *OrigFunc = nullptr;
254     Function *ClonedFunc = nullptr;
255 
256     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
257     // Keep track of Outlined Functions and the basic block they're called from.
258     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
259 
260     // ClonedFunc is inlined in one of its callers after function
261     // outlining.
262     bool IsFunctionInlined = false;
263     // The cost of the region to be outlined.
264     int OutlinedRegionCost = 0;
265     // ClonedOI is specific to outlining non-early return blocks.
266     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
267     // ClonedOMRI is specific to outlining cold regions.
268     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
269     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
270     OptimizationRemarkEmitter &ORE;
271     function_ref<AssumptionCache *(Function &)> LookupAC;
272     function_ref<TargetTransformInfo &(Function &)> GetTTI;
273   };
274 
275 private:
276   int NumPartialInlining = 0;
277   function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
278   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
279   function_ref<TargetTransformInfo &(Function &)> GetTTI;
280   function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
281   function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
282   ProfileSummaryInfo &PSI;
283 
284   // Return the frequency of the OutlininingBB relative to F's entry point.
285   // The result is no larger than 1 and is represented using BP.
286   // (Note that the outlined region's 'head' block can only have incoming
287   // edges from the guarding entry blocks).
288   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
289 
290   // Return true if the callee of CB should be partially inlined with
291   // profit.
292   bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
293                            BlockFrequency WeightedOutliningRcost,
294                            OptimizationRemarkEmitter &ORE);
295 
296   // Try to inline DuplicateFunction (cloned from F with call to
297   // the OutlinedFunction into its callers. Return true
298   // if there is any successful inlining.
299   bool tryPartialInline(FunctionCloner &Cloner);
300 
301   // Compute the mapping from use site of DuplicationFunction to the enclosing
302   // BB's profile count.
303   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
304                                      DenseMap<User *, uint64_t> &SiteCountMap);
305 
306   bool IsLimitReached() {
307     return (MaxNumPartialInlining != -1 &&
308             NumPartialInlining >= MaxNumPartialInlining);
309   }
310 
311   static CallBase *getSupportedCallBase(User *U) {
312     if (isa<CallInst>(U) || isa<InvokeInst>(U))
313       return cast<CallBase>(U);
314     llvm_unreachable("All uses must be calls");
315     return nullptr;
316   }
317 
318   static CallBase *getOneCallSiteTo(Function *F) {
319     User *User = *F->user_begin();
320     return getSupportedCallBase(User);
321   }
322 
323   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
324     CallBase *CB = getOneCallSiteTo(F);
325     DebugLoc DLoc = CB->getDebugLoc();
326     BasicBlock *Block = CB->getParent();
327     return std::make_tuple(DLoc, Block);
328   }
329 
330   // Returns the costs associated with function outlining:
331   // - The first value is the non-weighted runtime cost for making the call
332   //   to the outlined function, including the addtional  setup cost in the
333   //    outlined function itself;
334   // - The second value is the estimated size of the new call sequence in
335   //   basic block Cloner.OutliningCallBB;
336   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
337 
338   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
339   // approximate both the size and runtime cost (Note that in the current
340   // inline cost analysis, there is no clear distinction there either).
341   static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);
342 
343   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
344   std::unique_ptr<FunctionOutliningMultiRegionInfo>
345   computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
346 };
347 
348 struct PartialInlinerLegacyPass : public ModulePass {
349   static char ID; // Pass identification, replacement for typeid
350 
351   PartialInlinerLegacyPass() : ModulePass(ID) {
352     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
353   }
354 
355   void getAnalysisUsage(AnalysisUsage &AU) const override {
356     AU.addRequired<AssumptionCacheTracker>();
357     AU.addRequired<ProfileSummaryInfoWrapperPass>();
358     AU.addRequired<TargetTransformInfoWrapperPass>();
359     AU.addRequired<TargetLibraryInfoWrapperPass>();
360   }
361 
362   bool runOnModule(Module &M) override {
363     if (skipModule(M))
364       return false;
365 
366     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
367     TargetTransformInfoWrapperPass *TTIWP =
368         &getAnalysis<TargetTransformInfoWrapperPass>();
369     ProfileSummaryInfo &PSI =
370         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
371 
372     auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
373       return ACT->getAssumptionCache(F);
374     };
375 
376     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
377       return ACT->lookupAssumptionCache(F);
378     };
379 
380     auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
381       return TTIWP->getTTI(F);
382     };
383 
384     auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
385       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
386     };
387 
388     return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
389                               GetTLI, PSI)
390         .run(M);
391   }
392 };
393 
394 } // end anonymous namespace
395 
396 std::unique_ptr<FunctionOutliningMultiRegionInfo>
397 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
398                                                     OptimizationRemarkEmitter &ORE) {
399   BasicBlock *EntryBlock = &F->front();
400 
401   DominatorTree DT(*F);
402   LoopInfo LI(DT);
403   BranchProbabilityInfo BPI(*F, LI);
404   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
405   BlockFrequencyInfo *BFI;
406   if (!GetBFI) {
407     ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
408     BFI = ScopedBFI.get();
409   } else
410     BFI = &(GetBFI(*F));
411 
412   // Return if we don't have profiling information.
413   if (!PSI.hasInstrumentationProfile())
414     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
415 
416   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
417       std::make_unique<FunctionOutliningMultiRegionInfo>();
418 
419   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
420     BasicBlock *Dom = BlockList.front();
421     return BlockList.size() > 1 && Dom->hasNPredecessors(1);
422   };
423 
424   auto IsSingleExit =
425       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
426     BasicBlock *ExitBlock = nullptr;
427     for (auto *Block : BlockList) {
428       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
429         if (!is_contained(BlockList, *SI)) {
430           if (ExitBlock) {
431             ORE.emit([&]() {
432               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
433                                               &SI->front())
434                      << "Region dominated by "
435                      << ore::NV("Block", BlockList.front()->getName())
436                      << " has more than one region exit edge.";
437             });
438             return nullptr;
439           } else
440             ExitBlock = Block;
441         }
442       }
443     }
444     return ExitBlock;
445   };
446 
447   auto BBProfileCount = [BFI](BasicBlock *BB) {
448     return BFI->getBlockProfileCount(BB)
449                ? BFI->getBlockProfileCount(BB).getValue()
450                : 0;
451   };
452 
453   // Use the same computeBBInlineCost function to compute the cost savings of
454   // the outlining the candidate region.
455   TargetTransformInfo *FTTI = &GetTTI(*F);
456   int OverallFunctionCost = 0;
457   for (auto &BB : *F)
458     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
459 
460 #ifndef NDEBUG
461   if (TracePartialInlining)
462     dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
463 #endif
464   int MinOutlineRegionCost =
465       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
466   BranchProbability MinBranchProbability(
467       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
468       MinBlockCounterExecution);
469   bool ColdCandidateFound = false;
470   BasicBlock *CurrEntry = EntryBlock;
471   std::vector<BasicBlock *> DFS;
472   DenseMap<BasicBlock *, bool> VisitedMap;
473   DFS.push_back(CurrEntry);
474   VisitedMap[CurrEntry] = true;
475   // Use Depth First Search on the basic blocks to find CFG edges that are
476   // considered cold.
477   // Cold regions considered must also have its inline cost compared to the
478   // overall inline cost of the original function.  The region is outlined only
479   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
480   // more.
481   while (!DFS.empty()) {
482     auto *thisBB = DFS.back();
483     DFS.pop_back();
484     // Only consider regions with predecessor blocks that are considered
485     // not-cold (default: part of the top 99.99% of all block counters)
486     // AND greater than our minimum block execution count (default: 100).
487     if (PSI.isColdBlock(thisBB, BFI) ||
488         BBProfileCount(thisBB) < MinBlockCounterExecution)
489       continue;
490     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
491       if (VisitedMap[*SI])
492         continue;
493       VisitedMap[*SI] = true;
494       DFS.push_back(*SI);
495       // If branch isn't cold, we skip to the next one.
496       BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
497       if (SuccProb > MinBranchProbability)
498         continue;
499 #ifndef NDEBUG
500       if (TracePartialInlining) {
501         dbgs() << "Found cold edge: " << thisBB->getName() << "->"
502                << (*SI)->getName() << "\nBranch Probability = " << SuccProb
503                << "\n";
504       }
505 #endif
506       SmallVector<BasicBlock *, 8> DominateVector;
507       DT.getDescendants(*SI, DominateVector);
508       // We can only outline single entry regions (for now).
509       if (!IsSingleEntry(DominateVector))
510         continue;
511       BasicBlock *ExitBlock = nullptr;
512       // We can only outline single exit regions (for now).
513       if (!(ExitBlock = IsSingleExit(DominateVector)))
514         continue;
515       int OutlineRegionCost = 0;
516       for (auto *BB : DominateVector)
517         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
518 
519 #ifndef NDEBUG
520       if (TracePartialInlining)
521         dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
522 #endif
523 
524       if (OutlineRegionCost < MinOutlineRegionCost) {
525         ORE.emit([&]() {
526           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
527                                             &SI->front())
528                  << ore::NV("Callee", F) << " inline cost-savings smaller than "
529                  << ore::NV("Cost", MinOutlineRegionCost);
530         });
531         continue;
532       }
533       // For now, ignore blocks that belong to a SISE region that is a
534       // candidate for outlining.  In the future, we may want to look
535       // at inner regions because the outer region may have live-exit
536       // variables.
537       for (auto *BB : DominateVector)
538         VisitedMap[BB] = true;
539       // ReturnBlock here means the block after the outline call
540       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
541       // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
542       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
543           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
544       OutliningInfo->ORI.push_back(RegInfo);
545 #ifndef NDEBUG
546       if (TracePartialInlining) {
547         dbgs() << "Found Cold Candidate starting at block: "
548                << DominateVector.front()->getName() << "\n";
549       }
550 #endif
551       ColdCandidateFound = true;
552       NumColdRegionsFound++;
553     }
554   }
555   if (ColdCandidateFound)
556     return OutliningInfo;
557   else
558     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
559 }
560 
// Detect the canonical single-region candidate: a chain of conditional
// "guard" blocks starting at the entry, where one side reaches a return
// block and the other side dominates the region to be outlined. Returns
// null when F does not match this shape.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  // Returns true if BB's terminator is a return instruction.
  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Order the pair as (return block, non-return block); both null when
  // neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk the guard chain from the entry, collecting entry blocks until a
  // (ReturnBlock, NonReturnBlock) split is found.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  // NOTE(review): Entries is captured BY VALUE here, so blocks inserted into
  // the outer Entries set by the peeling loop below are not visible to this
  // lambda — presumably intentional; confirm before changing the capture.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verify the candidate shape and record which entries branch directly to
  // ReturnBlock (also captures Entries by value — see the note above).
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
709 
710 // Check if there is PGO data or user annotated branch data:
711 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
712   if (F->hasProfileData())
713     return true;
714   // Now check if any of the entry block has MD_prof data:
715   for (auto *E : OI->Entries) {
716     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
717     if (!BR || BR->isUnconditional())
718       continue;
719     uint64_t T, F;
720     if (BR->extractProfMetadata(T, F))
721       return true;
722   }
723   return false;
724 }
725 
// Compute the frequency of the block containing the outline call, relative
// to the cloned function's entry block, clamped to at most 1. Without
// profile data the result is conservatively biased upward.
BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
  // we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  // With real profile data the measured relative frequency is trusted as-is.
  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}
765 
// Decide whether call site CB (which calls the cloned function) should be
// partially inlined into its caller. WeightedOutliningRcost is the runtime
// overhead of calling the outlined function, already scaled by the relative
// frequency of the outlining call block; the savings from eliminating the
// call must be at least that large. Emits optimization remarks explaining
// each decision via ORE.
bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // With cost analysis disabled, only legality of inlining matters.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  // "Always" means the regular inliner will fully inline this call; partial
  // inlining would only get in the way.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  // A falsy InlineCost means the cost exceeds the threshold.
  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
847 
848 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
849 // For now use a simplified version. The returned 'InlineCost' will be used
850 // to esimate the size cost as well as runtime cost of the BB.
851 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
852                                             TargetTransformInfo *TTI) {
853   int InlineCost = 0;
854   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
855   for (Instruction &I : BB->instructionsWithoutDebug()) {
856     // Skip free instructions.
857     switch (I.getOpcode()) {
858     case Instruction::BitCast:
859     case Instruction::PtrToInt:
860     case Instruction::IntToPtr:
861     case Instruction::Alloca:
862     case Instruction::PHI:
863       continue;
864     case Instruction::GetElementPtr:
865       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
866         continue;
867       break;
868     default:
869       break;
870     }
871 
872     if (I.isLifetimeStartOrEnd())
873       continue;
874 
875     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
876       Intrinsic::ID IID = II->getIntrinsicID();
877       SmallVector<Type *, 4> Tys;
878       FastMathFlags FMF;
879       for (Value *Val : II->args())
880         Tys.push_back(Val->getType());
881 
882       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
883         FMF = FPMO->getFastMathFlags();
884 
885       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
886       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
887       continue;
888     }
889 
890     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
891       InlineCost += getCallsiteCost(*CI, DL);
892       continue;
893     }
894 
895     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
896       InlineCost += getCallsiteCost(*II, DL);
897       continue;
898     }
899 
900     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
901       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
902       continue;
903     }
904     InlineCost += InlineConstants::InstrCost;
905   }
906   return InlineCost;
907 }
908 
909 std::tuple<int, int>
910 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
911   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
912   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
913     Function *OutlinedFunc = FuncBBPair.first;
914     BasicBlock* OutliningCallBB = FuncBBPair.second;
915     // Now compute the cost of the call sequence to the outlined function
916     // 'OutlinedFunction' in BB 'OutliningCallBB':
917     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
918     OutliningFuncCallCost +=
919         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
920 
921     // Now compute the cost of the extracted/outlined function itself:
922     for (BasicBlock &BB : *OutlinedFunc)
923       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
924   }
925   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
926          "Outlined function cost should be no less than the outlined region");
927 
928   // The code extractor introduces a new root and exit stub blocks with
929   // additional unconditional branches. Those branches will be eliminated
930   // later with bb layout. The cost should be adjusted accordingly:
931   OutlinedFunctionCost -=
932       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
933 
934   int OutliningRuntimeOverhead =
935       OutliningFuncCallCost +
936       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
937       ExtraOutliningPenalty;
938 
939   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
940 }
941 
942 // Create the callsite to profile count map which is
943 // used to update the original function's entry count,
944 // after the function is partially inlined into the callsite.
945 void PartialInlinerImpl::computeCallsiteToProfCountMap(
946     Function *DuplicateFunction,
947     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
948   std::vector<User *> Users(DuplicateFunction->user_begin(),
949                             DuplicateFunction->user_end());
950   Function *CurrentCaller = nullptr;
951   std::unique_ptr<BlockFrequencyInfo> TempBFI;
952   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
953 
954   auto ComputeCurrBFI = [&,this](Function *Caller) {
955       // For the old pass manager:
956       if (!GetBFI) {
957         DominatorTree DT(*Caller);
958         LoopInfo LI(DT);
959         BranchProbabilityInfo BPI(*Caller, LI);
960         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
961         CurrentCallerBFI = TempBFI.get();
962       } else {
963         // New pass manager:
964         CurrentCallerBFI = &(GetBFI(*Caller));
965       }
966   };
967 
968   for (User *User : Users) {
969     CallBase *CB = getSupportedCallBase(User);
970     Function *Caller = CB->getCaller();
971     if (CurrentCaller != Caller) {
972       CurrentCaller = Caller;
973       ComputeCurrBFI(Caller);
974     } else {
975       assert(CurrentCallerBFI && "CallerBFI is not set");
976     }
977     BasicBlock *CallBB = CB->getParent();
978     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
979     if (Count)
980       CallSiteToProfCountMap[User] = *Count;
981     else
982       CallSiteToProfCountMap[User] = 0;
983   }
984 }
985 
986 PartialInlinerImpl::FunctionCloner::FunctionCloner(
987     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
988     function_ref<AssumptionCache *(Function &)> LookupAC,
989     function_ref<TargetTransformInfo &(Function &)> GetTTI)
990     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
991   ClonedOI = std::make_unique<FunctionOutliningInfo>();
992 
993   // Clone the function, so that we can hack away on it.
994   ValueToValueMapTy VMap;
995   ClonedFunc = CloneFunction(F, VMap);
996 
997   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
998   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
999   for (BasicBlock *BB : OI->Entries) {
1000     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1001   }
1002   for (BasicBlock *E : OI->ReturnBlockPreds) {
1003     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1004     ClonedOI->ReturnBlockPreds.push_back(NewE);
1005   }
1006   // Go ahead and update all uses to the duplicate, so that we can just
1007   // use the inliner functionality when we're done hacking.
1008   F->replaceAllUsesWith(ClonedFunc);
1009 }
1010 
1011 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1012     Function *F, FunctionOutliningMultiRegionInfo *OI,
1013     OptimizationRemarkEmitter &ORE,
1014     function_ref<AssumptionCache *(Function &)> LookupAC,
1015     function_ref<TargetTransformInfo &(Function &)> GetTTI)
1016     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1017   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1018 
1019   // Clone the function, so that we can hack away on it.
1020   ValueToValueMapTy VMap;
1021   ClonedFunc = CloneFunction(F, VMap);
1022 
1023   // Go through all Outline Candidate Regions and update all BasicBlock
1024   // information.
1025   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1026        OI->ORI) {
1027     SmallVector<BasicBlock *, 8> Region;
1028     for (BasicBlock *BB : RegionInfo.Region) {
1029       Region.push_back(cast<BasicBlock>(VMap[BB]));
1030     }
1031     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1032     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1033     BasicBlock *NewReturnBlock = nullptr;
1034     if (RegionInfo.ReturnBlock)
1035       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1036     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1037         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1038     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1039   }
1040   // Go ahead and update all uses to the duplicate, so that we can just
1041   // use the inliner functionality when we're done hacking.
1042   F->replaceAllUsesWith(ClonedFunc);
1043 }
1044 
1045 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
1046   auto getFirstPHI = [](BasicBlock *BB) {
1047     BasicBlock::iterator I = BB->begin();
1048     PHINode *FirstPhi = nullptr;
1049     while (I != BB->end()) {
1050       PHINode *Phi = dyn_cast<PHINode>(I);
1051       if (!Phi)
1052         break;
1053       if (!FirstPhi) {
1054         FirstPhi = Phi;
1055         break;
1056       }
1057     }
1058     return FirstPhi;
1059   };
1060 
1061   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1062   // blocks.
1063   if (!ClonedOI)
1064     return;
1065 
1066   // Special hackery is needed with PHI nodes that have inputs from more than
1067   // one extracted block.  For simplicity, just split the PHIs into a two-level
1068   // sequence of PHIs, some of which will go in the extracted region, and some
1069   // of which will go outside.
1070   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1071   // only split block when necessary:
1072   PHINode *FirstPhi = getFirstPHI(PreReturn);
1073   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1074 
1075   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1076     return;
1077 
1078   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1079     Value *CommonValue = PN->getIncomingValue(0);
1080     if (all_of(PN->incoming_values(),
1081                [&](Value *V) { return V == CommonValue; }))
1082       return CommonValue;
1083     return nullptr;
1084   };
1085 
1086   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1087       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1088   BasicBlock::iterator I = PreReturn->begin();
1089   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1090   SmallVector<Instruction *, 4> DeadPhis;
1091   while (I != PreReturn->end()) {
1092     PHINode *OldPhi = dyn_cast<PHINode>(I);
1093     if (!OldPhi)
1094       break;
1095 
1096     PHINode *RetPhi =
1097         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1098     OldPhi->replaceAllUsesWith(RetPhi);
1099     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1100 
1101     RetPhi->addIncoming(&*I, PreReturn);
1102     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1103       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1104       OldPhi->removeIncomingValue(E);
1105     }
1106 
1107     // After incoming values splitting, the old phi may become trivial.
1108     // Keeping the trivial phi can introduce definition inside the outline
1109     // region which is live-out, causing necessary overhead (load, store
1110     // arg passing etc).
1111     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1112       OldPhi->replaceAllUsesWith(OldPhiVal);
1113       DeadPhis.push_back(OldPhi);
1114     }
1115     ++I;
1116   }
1117   for (auto *DP : DeadPhis)
1118     DP->eraseFromParent();
1119 
1120   for (auto E : ClonedOI->ReturnBlockPreds) {
1121     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1122   }
1123 }
1124 
1125 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1126 
1127   auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
1128     int Cost = 0;
1129     for (BasicBlock* BB : Region)
1130       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1131     return Cost;
1132   };
1133 
1134   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1135 
1136   if (ClonedOMRI->ORI.empty())
1137     return false;
1138 
1139   // The CodeExtractor needs a dominator tree.
1140   DominatorTree DT;
1141   DT.recalculate(*ClonedFunc);
1142 
1143   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1144   LoopInfo LI(DT);
1145   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1146   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1147 
1148   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1149   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1150 
1151   SetVector<Value *> Inputs, Outputs, Sinks;
1152   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1153        ClonedOMRI->ORI) {
1154     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1155 
1156     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1157                      ClonedFuncBFI.get(), &BPI,
1158                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1159                      /* AllowVarargs */ false);
1160 
1161     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1162 
1163 #ifndef NDEBUG
1164     if (TracePartialInlining) {
1165       dbgs() << "inputs: " << Inputs.size() << "\n";
1166       dbgs() << "outputs: " << Outputs.size() << "\n";
1167       for (Value *value : Inputs)
1168         dbgs() << "value used in func: " << *value << "\n";
1169       for (Value *output : Outputs)
1170         dbgs() << "instr used in func: " << *output << "\n";
1171     }
1172 #endif
1173     // Do not extract regions that have live exit variables.
1174     if (Outputs.size() > 0 && !ForceLiveExit)
1175       continue;
1176 
1177     Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1178 
1179     if (OutlinedFunc) {
1180       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1181       BasicBlock *OutliningCallBB = OCS->getParent();
1182       assert(OutliningCallBB->getParent() == ClonedFunc);
1183       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1184       NumColdRegionsOutlined++;
1185       OutlinedRegionCost += CurrentOutlinedRegionCost;
1186 
1187       if (MarkOutlinedColdCC) {
1188         OutlinedFunc->setCallingConv(CallingConv::Cold);
1189         OCS->setCallingConv(CallingConv::Cold);
1190       }
1191     } else
1192       ORE.emit([&]() {
1193         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1194                                         &RegionInfo.Region.front()->front())
1195                << "Failed to extract region at block "
1196                << ore::NV("Block", RegionInfo.Region.front());
1197       });
1198   }
1199 
1200   return !OutlinedFunctions.empty();
1201 }
1202 
// Extract everything except the entry blocks and the (normalized) return
// block of the cloned function into a single outlined function. Returns the
// outlined function, or null if code extraction failed.
Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partial inlined into the caller
  // (i.e. not to be extracted to the out of line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           llvm::is_contained(ClonedOI->Entries, BB);
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");
  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
  // The non-return block always goes first; its cost is tallied into
  // OutlinedRegionCost along with every other extracted block.
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
      ClonedOI->NonReturnBlock, ClonedFuncTTI);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
    }

  // Extract the body of the if.
  CodeExtractorAnalysisCache CEAC(*ClonedFunc);
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
                    /* AllowVarargs */ true)
          .extractCodeRegion(CEAC);

  if (OutlinedFunc) {
    // Remember where the call to the outlined function lives so later cost
    // analysis can weight it by block frequency.
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}
1262 
1263 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1264   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1265   // users (function pointers, etc.) back to the original function.
1266   ClonedFunc->replaceAllUsesWith(OrigFunc);
1267   ClonedFunc->eraseFromParent();
1268   if (!IsFunctionInlined) {
1269     // Remove each function that was speculatively created if there is no
1270     // reference.
1271     for (auto FuncBBPair : OutlinedFunctions) {
1272       Function *Func = FuncBBPair.first;
1273       Func->eraseFromParent();
1274     }
1275   }
1276 }
1277 
// Attempt to partially inline F into its callers. First tries multi-region
// (cold-region) outlining when profile data is available, then falls back to
// single-region outlining of the early-return pattern. Returns {whether any
// caller was changed, the newly created outlined function if one should be
// reconsidered by the driver loop}.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

  // Address-taken functions may be called indirectly; replacing uses with a
  // clone is not sound.
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  // A cold function is not worth the code growth at its call sites.
  if (PSI.isFunctionEntryCold(F))
    return {false, nullptr};

  // Nothing to do if nobody calls it.
  if (F->users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      // The cloner redirects F's users to the clone on construction and
      // undoes that (and deletes the clone) in its destructor.
      FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
  // Split return-block PHIs so the extraction boundary is clean.
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  // Hand the new outlined function back to the driver so it can be
  // considered for partial inlining itself.
  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}
1353 
// Inline the (partially outlined) cloned function into every profitable call
// site. Updates the original function's entry count from the consumed call
// site counts. Returns true if at least one call site was inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The FunctionCloner constructor redirected all of OrigFunc's users to
  // the clone, so any remaining user here would be a bug.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users: inlining below removes call sites as it goes.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    // Respect the (debugging) limit on the number of partial inlinings.
    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count: the call site's count has been absorbed
    // into the caller, so subtract it (saturating at zero).
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    // Keeping the outlined functions is now justified; tell the cloner's
    // destructor not to delete them.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}
1472 
1473 bool PartialInlinerImpl::run(Module &M) {
1474   if (DisablePartialInlining)
1475     return false;
1476 
1477   std::vector<Function *> Worklist;
1478   Worklist.reserve(M.size());
1479   for (Function &F : M)
1480     if (!F.use_empty() && !F.isDeclaration())
1481       Worklist.push_back(&F);
1482 
1483   bool Changed = false;
1484   while (!Worklist.empty()) {
1485     Function *CurrFunc = Worklist.back();
1486     Worklist.pop_back();
1487 
1488     if (CurrFunc->use_empty())
1489       continue;
1490 
1491     bool Recursive = false;
1492     for (User *U : CurrFunc->users())
1493       if (Instruction *I = dyn_cast<Instruction>(U))
1494         if (I->getParent()->getParent() == CurrFunc) {
1495           Recursive = true;
1496           break;
1497         }
1498     if (Recursive)
1499       continue;
1500 
1501     std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1502     if (Result.second)
1503       Worklist.push_back(Result.second);
1504     Changed |= Result.first;
1505   }
1506 
1507   return Changed;
1508 }
1509 
// Pass identification, replacement for typeid.
char PartialInlinerLegacyPass::ID = 0;

// Register the legacy pass and the analyses it depends on.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

// Factory used by the legacy pass manager to create this pass.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1524 
1525 PreservedAnalyses PartialInlinerPass::run(Module &M,
1526                                           ModuleAnalysisManager &AM) {
1527   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1528 
1529   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1530     return FAM.getResult<AssumptionAnalysis>(F);
1531   };
1532 
1533   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1534     return FAM.getCachedResult<AssumptionAnalysis>(F);
1535   };
1536 
1537   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1538     return FAM.getResult<BlockFrequencyAnalysis>(F);
1539   };
1540 
1541   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1542     return FAM.getResult<TargetIRAnalysis>(F);
1543   };
1544 
1545   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1546     return FAM.getResult<TargetLibraryAnalysis>(F);
1547   };
1548 
1549   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1550 
1551   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1552                          GetTLI, PSI, GetBFI)
1553           .run(M))
1554     return PreservedAnalyses::none();
1555   return PreservedAnalyses::all();
1556 }
1557