1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
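//
// For example, given a function of the form (illustrative pseudocode only,
// not taken from an actual test case):
//
//   void foo() {
//     if (X)        // cheap guard, evaluated by every caller
//       bar();      // expensive, rarely executed body
//   }
//
// the guard is inlined into foo's callers, while the guarded body is
// extracted into a separate (outlined) function that is called only when the
// guard is taken.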
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/CallSite.h"
34 #include "llvm/IR/DebugLoc.h"
35 #include "llvm/IR/DiagnosticInfo.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/IntrinsicInst.h"
42 #include "llvm/IR/Intrinsics.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/InitializePasses.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/BlockFrequency.h"
48 #include "llvm/Support/BranchProbability.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Transforms/IPO.h"
53 #include "llvm/Transforms/Utils/Cloning.h"
54 #include "llvm/Transforms/Utils/CodeExtractor.h"
55 #include "llvm/Transforms/Utils/ValueMapper.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <cstdint>
59 #include <functional>
60 #include <iterator>
61 #include <memory>
62 #include <tuple>
63 #include <vector>
64 
65 using namespace llvm;
66 
67 #define DEBUG_TYPE "partial-inlining"
68 
69 STATISTIC(NumPartialInlined,
70           "Number of callsites that functions were partially inlined into.");
71 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
72                                         "cold outlined regions were partially "
73                                         "inlined into their caller(s).");
74 STATISTIC(NumColdRegionsFound,
75            "Number of cold single entry/exit regions found.");
76 STATISTIC(NumColdRegionsOutlined,
77            "Number of cold single entry/exit regions outlined.");
78 
79 // Command line option to disable partial-inlining. The default is false:
80 static cl::opt<bool>
81     DisablePartialInlining("disable-partial-inlining", cl::init(false),
82                            cl::Hidden, cl::desc("Disable partial inlining"));
83 // Command line option to disable multi-region partial-inlining. The default is
84 // false:
85 static cl::opt<bool> DisableMultiRegionPartialInline(
86     "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
87     cl::desc("Disable multi-region partial inlining"));
88 
89 // Command line option to force outlining in regions with live exit variables.
90 // The default is false:
91 static cl::opt<bool>
92     ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
93                cl::desc("Force outline regions with live exits"));
94 
95 // Command line option to enable marking outline functions with Cold Calling
96 // Convention. The default is false:
97 static cl::opt<bool>
98     MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
99                        cl::desc("Mark outline function calls with ColdCC"));
100 
101 #ifndef NDEBUG
102 // Command line option to trace partial-inlining. The default is false:
103 static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
104                                           cl::init(false), cl::Hidden,
105                                           cl::desc("Trace partial inlining."));
106 #endif
107 
108 // This is an option used by testing:
109 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
110                                       cl::init(false), cl::ZeroOrMore,
111                                       cl::ReallyHidden,
112                                       cl::desc("Skip Cost Analysis"));
113 // Used to determine if a cold region is worth outlining based on
114 // its inlining cost compared to the original function. The default is 10%,
115 // i.e. the cold region must reduce the inlining cost of the original
116 // function by at least 10% to be outlined.
117 static cl::opt<float> MinRegionSizeRatio(
118     "min-region-size-ratio", cl::init(0.1), cl::Hidden,
119     cl::desc("Minimum ratio comparing relative sizes of each "
120              "outline candidate and original function"));
121 // Used to tune the minimum execution count needed in the predecessor
122 // block of the cold edge, i.e. a confidence threshold.
123 static cl::opt<unsigned>
124     MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
125                              cl::desc("Minimum block executions to consider "
126                                       "its BranchProbabilityInfo valid"));
127 // Used to determine when an edge is considered cold. The default is 10%, i.e.
128 // if the branch probability is 10% or less, the edge is deemed 'cold'.
129 static cl::opt<float> ColdBranchRatio(
130     "cold-branch-ratio", cl::init(0.1), cl::Hidden,
131     cl::desc("Minimum BranchProbability to consider a region cold."));
132 
133 static cl::opt<unsigned> MaxNumInlineBlocks(
134     "max-num-inline-blocks", cl::init(5), cl::Hidden,
135     cl::desc("Max number of blocks to be partially inlined"));
136 
137 // Command line option to set the maximum number of partial inlining allowed
138 // for the module. The default value of -1 means no limit.
139 static cl::opt<int> MaxNumPartialInlining(
140     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
141     cl::desc("Max number of partial inlining. The default is unlimited"));
142 
143 // Used only when PGO or user-annotated branch data is absent. It is
144 // the minimum value used to weight the outline region. If BFI
145 // produces a larger value, the BFI value will be used.
146 static cl::opt<int>
147     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
148                              cl::Hidden, cl::ZeroOrMore,
149                              cl::desc("Relative frequency of outline region to "
150                                       "the entry block"));
151 
152 static cl::opt<unsigned> ExtraOutliningPenalty(
153     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
154     cl::desc("A debug option to add additional penalty to the computed one."));
155 
156 namespace {
157 
158 struct FunctionOutliningInfo {
159   FunctionOutliningInfo() = default;
160 
161   // Returns the number of blocks to be inlined including all blocks
162   // in Entries and one return block.
163   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
164 
165   // A set of blocks including the function entry that guard
166   // the region to be outlined.
167   SmallVector<BasicBlock *, 4> Entries;
168 
169   // The return block that is not included in the outlined region.
170   BasicBlock *ReturnBlock = nullptr;
171 
172   // The dominating block of the region to be outlined.
173   BasicBlock *NonReturnBlock = nullptr;
174 
175   // The set of blocks in Entries that are predecessors to ReturnBlock.
176   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
177 };
178 
179 struct FunctionOutliningMultiRegionInfo {
180   FunctionOutliningMultiRegionInfo()
181       : ORI() {}
182 
183   // Container for outline regions
184   struct OutlineRegionInfo {
185     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
186                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
187                       BasicBlock *ReturnBlock)
188         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
189           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
190     SmallVector<BasicBlock *, 8> Region;
191     BasicBlock *EntryBlock;
192     BasicBlock *ExitBlock;
193     BasicBlock *ReturnBlock;
194   };
195 
196   SmallVector<OutlineRegionInfo, 4> ORI;
197 };
198 
199 struct PartialInlinerImpl {
200 
201   PartialInlinerImpl(
202       std::function<AssumptionCache &(Function &)> *GetAC,
203       function_ref<AssumptionCache *(Function &)> LookupAC,
204       std::function<TargetTransformInfo &(Function &)> *GTTI,
205       Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
206       std::function<const TargetLibraryInfo &(Function &)> *GTLI,
207       ProfileSummaryInfo *ProfSI)
208       : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
209         GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
210 
211   bool run(Module &M);
212   // Main part of the transformation that calls helper functions to find
213   // outlining candidates, clone and outline the function, and attempt to
214   // partially inline the resulting function. Returns true if
215   // inlining was successful, false otherwise. Also returns the outlined
216   // function (only if we partially inlined early returns) as there is a
217   // possibility to further "peel" early return statements that were left in
218   // the outlined function due to code size.
219   std::pair<bool, Function *> unswitchFunction(Function *F);
220 
221   // This class speculatively clones the function to be partial inlined.
222   // At the end of partial inlining, the remaining callsites to the cloned
223   // function that are not partially inlined will be fixed up to reference
224   // the original function, and the cloned function will be erased.
225   struct FunctionCloner {
226     // Two constructors, one for single region outlining, the other for
227     // multi-region outlining.
228     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
229                    OptimizationRemarkEmitter &ORE,
230                    function_ref<AssumptionCache *(Function &)> LookupAC);
231     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
232                    OptimizationRemarkEmitter &ORE,
233                    function_ref<AssumptionCache *(Function &)> LookupAC);
234     ~FunctionCloner();
235 
236     // Prepare for function outlining: making sure there is only
237     // one incoming edge from the extracted/outlined region to
238     // the return block.
239     void NormalizeReturnBlock();
240 
241     // Do function outlining for cold regions.
242     bool doMultiRegionFunctionOutlining();
243     // Do function outlining for region after early return block(s).
244     // NOTE: For vararg functions that do the vararg handling in the outlined
245     //       function, we temporarily generate IR that does not properly
246     //       forward varargs to the outlined function. Calling InlineFunction
247     //       will update calls to the outlined functions to properly forward
248     //       the varargs.
249     Function *doSingleRegionFunctionOutlining();
250 
251     Function *OrigFunc = nullptr;
252     Function *ClonedFunc = nullptr;
253 
254     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
255     // Keep track of Outlined Functions and the basic block they're called from.
256     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
257 
258     // ClonedFunc is inlined in one of its callers after function
259     // outlining.
260     bool IsFunctionInlined = false;
261     // The cost of the region to be outlined.
262     int OutlinedRegionCost = 0;
263     // ClonedOI is specific to outlining non-early return blocks.
264     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
265     // ClonedOMRI is specific to outlining cold regions.
266     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
267     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
268     OptimizationRemarkEmitter &ORE;
269     function_ref<AssumptionCache *(Function &)> LookupAC;
270   };
271 
272 private:
273   int NumPartialInlining = 0;
274   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
275   function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
276   std::function<TargetTransformInfo &(Function &)> *GetTTI;
277   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
278   std::function<const TargetLibraryInfo &(Function &)> *GetTLI;
279   ProfileSummaryInfo *PSI;
280 
281   // Return the frequency of the OutliningCallBB relative to F's entry point.
282   // The result is no larger than 1 and is represented as a BranchProbability.
283   // (Note that the outlined region's 'head' block can only have incoming
284   // edges from the guarding entry blocks).
285   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
286 
287   // Return true if the callee of CS should be partially inlined, i.e.
288   // partial inlining is expected to be profitable.
289   bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
290                            BlockFrequency WeightedOutliningRcost,
291                            OptimizationRemarkEmitter &ORE);
292 
293   // Try to inline DuplicateFunction (cloned from F with a call to
294   // the OutlinedFunction) into its callers. Return true
295   // if there is any successful inlining.
296   bool tryPartialInline(FunctionCloner &Cloner);
297 
298   // Compute the mapping from each use site of DuplicateFunction to the
299   // enclosing BB's profile count.
300   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
301                                      DenseMap<User *, uint64_t> &SiteCountMap);
302 
303   bool IsLimitReached() {
304     return (MaxNumPartialInlining != -1 &&
305             NumPartialInlining >= MaxNumPartialInlining);
306   }
307 
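  // Return U wrapped as a CallSite. All users passed here are expected to be
  // direct calls or invokes; anything else is a bug.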
308   static CallSite getCallSite(User *U) {
309     CallSite CS;
310     if (CallInst *CI = dyn_cast<CallInst>(U))
311       CS = CallSite(CI);
312     else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
313       CS = CallSite(II);
314     else
315       llvm_unreachable("All uses must be calls");
316     return CS;
317   }
318 
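  // Return the call site of the first (arbitrary) user of F. F must have at
  // least one user.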
319   static CallSite getOneCallSiteTo(Function *F) {
320     User *User = *F->user_begin();
321     return getCallSite(User);
322   }
323 
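  // Return the debug location and enclosing block of one call site to F,
  // used when emitting remarks about the original function.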
324   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
325     CallSite CS = getOneCallSiteTo(F);
326     DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
327     BasicBlock *Block = CS.getParent();
328     return std::make_tuple(DLoc, Block);
329   }
330 
331   // Returns the costs associated with function outlining:
332   // - The first value is the estimated size of the new call sequence(s) in
333   //   the basic block(s) that call the outlined function(s);
334   // - The second value is the non-weighted runtime overhead of making those
335   //   calls, including the additional setup cost in the outlined
336   //   function(s) themselves.
337   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
338 
339   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
340   // approximate both the size and runtime cost (note that the current
341   // inline cost analysis does not clearly distinguish between the two either).
342   static int computeBBInlineCost(BasicBlock *BB);
343 
344   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
345   std::unique_ptr<FunctionOutliningMultiRegionInfo>
346   computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
347 };
348 
349 struct PartialInlinerLegacyPass : public ModulePass {
350   static char ID; // Pass identification, replacement for typeid
351 
352   PartialInlinerLegacyPass() : ModulePass(ID) {
353     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
354   }
355 
356   void getAnalysisUsage(AnalysisUsage &AU) const override {
357     AU.addRequired<AssumptionCacheTracker>();
358     AU.addRequired<ProfileSummaryInfoWrapperPass>();
359     AU.addRequired<TargetTransformInfoWrapperPass>();
360     AU.addRequired<TargetLibraryInfoWrapperPass>();
361   }
362 
363   bool runOnModule(Module &M) override {
364     if (skipModule(M))
365       return false;
366 
367     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
368     TargetTransformInfoWrapperPass *TTIWP =
369         &getAnalysis<TargetTransformInfoWrapperPass>();
370     ProfileSummaryInfo *PSI =
371         &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
372 
373     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
374         [&ACT](Function &F) -> AssumptionCache & {
375       return ACT->getAssumptionCache(F);
376     };
377 
378     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
379       return ACT->lookupAssumptionCache(F);
380     };
381 
382     std::function<TargetTransformInfo &(Function &)> GetTTI =
383         [&TTIWP](Function &F) -> TargetTransformInfo & {
384       return TTIWP->getTTI(F);
385     };
386 
387     std::function<const TargetLibraryInfo &(Function &)> GetTLI =
388         [this](Function &F) -> TargetLibraryInfo & {
389       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
390     };
391 
392     return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
393                               &GetTTI, NoneType::None, &GetTLI, PSI)
394         .run(M);
395   }
396 };
397 
398 } // end anonymous namespace
399 
400 std::unique_ptr<FunctionOutliningMultiRegionInfo>
401 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
402                                                     OptimizationRemarkEmitter &ORE) {
403   BasicBlock *EntryBlock = &F->front();
404 
405   DominatorTree DT(*F);
406   LoopInfo LI(DT);
407   BranchProbabilityInfo BPI(*F, LI);
408   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
409   BlockFrequencyInfo *BFI;
410   if (!GetBFI) {
411     ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
412     BFI = ScopedBFI.get();
413   } else
414     BFI = &(*GetBFI)(*F);
415 
416   // Return if we don't have profiling information.
417   if (!PSI->hasInstrumentationProfile())
418     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
419 
420   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
421       std::make_unique<FunctionOutliningMultiRegionInfo>();
422 
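  // A candidate region (given as the list of blocks dominated by the cold
  // edge's target) is treated as single-entry if it has more than one block
  // and its dominating block has exactly one predecessor.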
423   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
424     BasicBlock *Dom = BlockList.front();
425     return BlockList.size() > 1 && Dom->hasNPredecessors(1);
426   };
427 
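  // Return the unique block in BlockList that has a successor outside the
  // region, or nullptr (with a remark) if there is more than one exit edge.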
428   auto IsSingleExit =
429       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
430     BasicBlock *ExitBlock = nullptr;
431     for (auto *Block : BlockList) {
432       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
433         if (!is_contained(BlockList, *SI)) {
434           if (ExitBlock) {
435             ORE.emit([&]() {
436               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
437                                               &SI->front())
438                      << "Region dominated by "
439                      << ore::NV("Block", BlockList.front()->getName())
440                      << " has more than one region exit edge.";
441             });
442             return nullptr;
443           } else
444             ExitBlock = Block;
445         }
446       }
447     }
448     return ExitBlock;
449   };
450 
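  // Profile count of BB, or zero if no count is available.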
451   auto BBProfileCount = [BFI](BasicBlock *BB) {
452     return BFI->getBlockProfileCount(BB)
453                ? BFI->getBlockProfileCount(BB).getValue()
454                : 0;
455   };
456 
457   // Use the same computeBBInlineCost function to compute the cost savings of
458   // outlining the candidate region.
459   int OverallFunctionCost = 0;
460   for (auto &BB : *F)
461     OverallFunctionCost += computeBBInlineCost(&BB);
462 
463 #ifndef NDEBUG
464   if (TracePartialInlining)
465     dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
466 #endif
467   int MinOutlineRegionCost =
468       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
469   BranchProbability MinBranchProbability(
470       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
471       MinBlockCounterExecution);
472   bool ColdCandidateFound = false;
473   BasicBlock *CurrEntry = EntryBlock;
474   std::vector<BasicBlock *> DFS;
475   DenseMap<BasicBlock *, bool> VisitedMap;
476   DFS.push_back(CurrEntry);
477   VisitedMap[CurrEntry] = true;
478   // Use Depth First Search on the basic blocks to find CFG edges that are
479   // considered cold.
480   // A cold region's inline cost must also be compared to the overall inline
481   // cost of the original function. The region is outlined only if it reduces
482   // the inline cost of the function by 'MinOutlineRegionCost' or
483   // more.
484   while (!DFS.empty()) {
485     auto *thisBB = DFS.back();
486     DFS.pop_back();
487     // Only consider regions with predecessor blocks that are considered
488     // not-cold (default: part of the top 99.99% of all block counters)
489     // and whose execution count is at least our minimum (default: 100).
490     if (PSI->isColdBlock(thisBB, BFI) ||
491         BBProfileCount(thisBB) < MinBlockCounterExecution)
492       continue;
493     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
494       if (VisitedMap[*SI])
495         continue;
496       VisitedMap[*SI] = true;
497       DFS.push_back(*SI);
498       // If the branch isn't cold, skip to the next successor.
499       BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
500       if (SuccProb > MinBranchProbability)
501         continue;
502 #ifndef NDEBUG
503       if (TracePartialInlining) {
504         dbgs() << "Found cold edge: " << thisBB->getName() << "->"
505                << (*SI)->getName() << "\nBranch Probability = " << SuccProb
506                << "\n";
507       }
508 #endif
509       SmallVector<BasicBlock *, 8> DominateVector;
510       DT.getDescendants(*SI, DominateVector);
511       // We can only outline single entry regions (for now).
512       if (!IsSingleEntry(DominateVector))
513         continue;
514       BasicBlock *ExitBlock = nullptr;
515       // We can only outline single exit regions (for now).
516       if (!(ExitBlock = IsSingleExit(DominateVector)))
517         continue;
518       int OutlineRegionCost = 0;
519       for (auto *BB : DominateVector)
520         OutlineRegionCost += computeBBInlineCost(BB);
521 
522 #ifndef NDEBUG
523       if (TracePartialInlining)
524         dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
525 #endif
526 
527       if (OutlineRegionCost < MinOutlineRegionCost) {
528         ORE.emit([&]() {
529           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
530                                             &SI->front())
531                  << ore::NV("Callee", F) << " inline cost-savings smaller than "
532                  << ore::NV("Cost", MinOutlineRegionCost);
533         });
534         continue;
535       }
536       // For now, ignore blocks that belong to a single-entry/single-exit
537       // region that is a candidate for outlining. In the future, we may want
538       // to look at inner regions because the outer region may have live-exit
539       // variables.
540       for (auto *BB : DominateVector)
541         VisitedMap[BB] = true;
542       // ReturnBlock here means the block after the outlined call.
543       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
544       // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
545       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
546           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
547       OutliningInfo->ORI.push_back(RegInfo);
548 #ifndef NDEBUG
549       if (TracePartialInlining) {
550         dbgs() << "Found Cold Candidate starting at block: "
551                << DominateVector.front()->getName() << "\n";
552       }
553 #endif
554       ColdCandidateFound = true;
555       NumColdRegionsFound++;
556     }
557   }
558   if (ColdCandidateFound)
559     return OutliningInfo;
560   else
561     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
562 }
563 
564 std::unique_ptr<FunctionOutliningInfo>
565 PartialInlinerImpl::computeOutliningInfo(Function *F) {
566   BasicBlock *EntryBlock = &F->front();
567   BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
568   if (!BR || BR->isUnconditional())
569     return std::unique_ptr<FunctionOutliningInfo>();
570 
571   // Returns true if Succ is BB's successor
572   auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
573     return is_contained(successors(BB), Succ);
574   };
575 
576   auto IsReturnBlock = [](BasicBlock *BB) {
577     Instruction *TI = BB->getTerminator();
578     return isa<ReturnInst>(TI);
579   };
580 
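  // If one of the two successors is a return block, return the pair
  // (ReturnBlock, NonReturnBlock); otherwise return (nullptr, nullptr).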
581   auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
582     if (IsReturnBlock(Succ1))
583       return std::make_tuple(Succ1, Succ2);
584     if (IsReturnBlock(Succ2))
585       return std::make_tuple(Succ2, Succ1);
586 
587     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
588   };
589 
590   // Detect a triangular shape:
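  // if one of the two successors is also a successor of the other, return the
  // pair (CommonSucc, OtherSucc); otherwise return (nullptr, nullptr).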
591   auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
592     if (IsSuccessor(Succ1, Succ2))
593       return std::make_tuple(Succ1, Succ2);
594     if (IsSuccessor(Succ2, Succ1))
595       return std::make_tuple(Succ2, Succ1);
596 
597     return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
598   };
599 
600   std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
601       std::make_unique<FunctionOutliningInfo>();
602 
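  // A minimal sketch of the CFG shape matched by the loop below (illustrative
  // IR only; block names are arbitrary, not taken from a real test case):
  //
  //   entry:  br i1 %cond, label %return, label %body
  //   body:   ...              ; dominates the region to be outlined
  //   return: ret ...
  //
  // The entry block (and possibly a short chain of further guarding blocks,
  // detected via the triangular-shape check above) is kept for inlining,
  // while the non-return side is outlined.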
603   BasicBlock *CurrEntry = EntryBlock;
604   bool CandidateFound = false;
605   do {
606     // The number of blocks to be inlined has already reached
607     // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
608     // disables partial inlining for the function.
609     if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
610       break;
611 
612     if (succ_size(CurrEntry) != 2)
613       break;
614 
615     BasicBlock *Succ1 = *succ_begin(CurrEntry);
616     BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
617 
618     BasicBlock *ReturnBlock, *NonReturnBlock;
619     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
620 
621     if (ReturnBlock) {
622       OutliningInfo->Entries.push_back(CurrEntry);
623       OutliningInfo->ReturnBlock = ReturnBlock;
624       OutliningInfo->NonReturnBlock = NonReturnBlock;
625       CandidateFound = true;
626       break;
627     }
628 
629     BasicBlock *CommSucc;
630     BasicBlock *OtherSucc;
631     std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
632 
633     if (!CommSucc)
634       break;
635 
636     OutliningInfo->Entries.push_back(CurrEntry);
637     CurrEntry = OtherSucc;
638   } while (true);
639 
640   if (!CandidateFound)
641     return std::unique_ptr<FunctionOutliningInfo>();
642 
643   // Do a sanity check of the entries: there should not
644   // be any successors (not in the entry set) other than
645   // {ReturnBlock, NonReturnBlock}.
646   assert(OutliningInfo->Entries[0] == &F->front() &&
647          "Function Entry must be the first in Entries vector");
648   DenseSet<BasicBlock *> Entries;
649   for (BasicBlock *E : OutliningInfo->Entries)
650     Entries.insert(E);
651 
652   // Returns true if BB has a predecessor that is not
653   // in the Entries set.
654   auto HasNonEntryPred = [Entries](BasicBlock *BB) {
655     for (auto Pred : predecessors(BB)) {
656       if (!Entries.count(Pred))
657         return true;
658     }
659     return false;
660   };
661   auto CheckAndNormalizeCandidate =
662       [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
663         for (BasicBlock *E : OutliningInfo->Entries) {
664           for (auto Succ : successors(E)) {
665             if (Entries.count(Succ))
666               continue;
667             if (Succ == OutliningInfo->ReturnBlock)
668               OutliningInfo->ReturnBlockPreds.push_back(E);
669             else if (Succ != OutliningInfo->NonReturnBlock)
670               return false;
671           }
672           // There should not be any outside incoming edges either:
673           if (HasNonEntryPred(E))
674             return false;
675         }
676         return true;
677       };
678 
679   if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
680     return std::unique_ptr<FunctionOutliningInfo>();
681 
682   // Now further grow the candidate's inlining region by
683   // peeling off dominating blocks from the outlining region:
684   while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
685     BasicBlock *Cand = OutliningInfo->NonReturnBlock;
686     if (succ_size(Cand) != 2)
687       break;
688 
689     if (HasNonEntryPred(Cand))
690       break;
691 
692     BasicBlock *Succ1 = *succ_begin(Cand);
693     BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
694 
695     BasicBlock *ReturnBlock, *NonReturnBlock;
696     std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
697     if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
698       break;
699 
700     if (NonReturnBlock->getSinglePredecessor() != Cand)
701       break;
702 
703     // Now grow and update OutliningInfo:
704     OutliningInfo->Entries.push_back(Cand);
705     OutliningInfo->NonReturnBlock = NonReturnBlock;
706     OutliningInfo->ReturnBlockPreds.push_back(Cand);
707     Entries.insert(Cand);
708   }
709 
710   return OutliningInfo;
711 }
712 
713 // Check if there is PGO data or user annotated branch data:
714 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
715   if (F->hasProfileData())
716     return true;
717   // Now check if any of the entry blocks has MD_prof data:
718   for (auto *E : OI->Entries) {
719     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
720     if (!BR || BR->isUnconditional())
721       continue;
722     uint64_t T, F;
723     if (BR->extractProfMetadata(T, F))
724       return true;
725   }
726   return false;
727 }
728 
729 BranchProbability
730 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
731   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
732   auto EntryFreq =
733       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
734   auto OutliningCallFreq =
735       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
736   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
737   // we outlined any regions, so we may encounter situations where the
738   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
739   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
740     OutliningCallFreq = EntryFreq;
741   }
742   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
743       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
744 
745   if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
746     return OutlineRegionRelFreq;
747 
748   // When profile data is not available, we need to be conservative in
749   // estimating the overall savings. Static branch prediction can usually
750   // guess the branch direction right (taken/non-taken), but the guessed
751   // branch probability is usually not biased enough. When the
752   // outlined region is predicted to be likely, its probability needs
753   // to be made higher (more biased) so as not to under-estimate the cost of
754   // function outlining. On the other hand, if the outlined region
755   // is predicted to be less likely, the predicted probability is usually
756   // higher than the actual one. For instance, the actual probability of the
757   // less likely target may be only 5%, but the guessed probability can be
758   // 40%. In the latter case, there is no need for further adjustment.
759   // FIXME: add an option for this.
760   if (OutlineRegionRelFreq < BranchProbability(45, 100))
761     return OutlineRegionRelFreq;
762 
763   OutlineRegionRelFreq = std::max(
764       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
765 
766   return OutlineRegionRelFreq;
767 }
768 
769 bool PartialInlinerImpl::shouldPartialInline(
770     CallSite CS, FunctionCloner &Cloner,
771     BlockFrequency WeightedOutliningRcost,
772     OptimizationRemarkEmitter &ORE) {
773   using namespace ore;
774 
775   Instruction *Call = CS.getInstruction();
776   Function *Callee = CS.getCalledFunction();
777   assert(Callee == Cloner.ClonedFunc);
778 
779   if (SkipCostAnalysis)
780     return isInlineViable(*Callee).isSuccess();
781 
782   Function *Caller = CS.getCaller();
783   auto &CalleeTTI = (*GetTTI)(*Callee);
784   bool RemarksEnabled =
785       Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
786           DEBUG_TYPE);
787   assert(Call && "invalid callsite for partial inline");
788   InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
789                                 CalleeTTI, *GetAssumptionCache, GetBFI, *GetTLI,
790                                 PSI, RemarksEnabled ? &ORE : nullptr);
791 
792   if (IC.isAlways()) {
793     ORE.emit([&]() {
794       return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
795              << NV("Callee", Cloner.OrigFunc)
796              << " should always be fully inlined, not partially";
797     });
798     return false;
799   }
800 
801   if (IC.isNever()) {
802     ORE.emit([&]() {
803       return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
804              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
805              << NV("Caller", Caller)
806              << " because it should never be inlined (cost=never)";
807     });
808     return false;
809   }
810 
811   if (!IC) {
812     ORE.emit([&]() {
813       return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
814              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
815              << NV("Caller", Caller) << " because too costly to inline (cost="
816              << NV("Cost", IC.getCost()) << ", threshold="
817              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
818     });
819     return false;
820   }
821   const DataLayout &DL = Caller->getParent()->getDataLayout();
822 
823   // The savings of eliminating the call:
824   int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
825   BlockFrequency NormWeightedSavings(NonWeightedSavings);
826 
827   // Weighted saving is smaller than weighted cost, return false
828   if (NormWeightedSavings < WeightedOutliningRcost) {
829     ORE.emit([&]() {
830       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
831                                         Call)
832              << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
833              << NV("Caller", Caller) << " runtime overhead (overhead="
834              << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
835              << ", savings="
836              << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
837              << ")"
838              << " of making the outlined call is too high";
839     });
840 
841     return false;
842   }
843 
844   ORE.emit([&]() {
845     return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
846            << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
847            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
848            << " (threshold="
849            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
850   });
851   return true;
852 }
853 
854 // TODO: Ideally we should share the Inliner's InlineCost analysis code.
855 // For now use a simplified version. The returned 'InlineCost' will be used
856 // to estimate the size cost as well as the runtime cost of the BB.
857 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
858   int InlineCost = 0;
859   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
860   for (Instruction &I : BB->instructionsWithoutDebug()) {
861     // Skip free instructions.
862     switch (I.getOpcode()) {
863     case Instruction::BitCast:
864     case Instruction::PtrToInt:
865     case Instruction::IntToPtr:
866     case Instruction::Alloca:
867     case Instruction::PHI:
868       continue;
869     case Instruction::GetElementPtr:
870       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
871         continue;
872       break;
873     default:
874       break;
875     }
876 
877     if (I.isLifetimeStartOrEnd())
878       continue;
879 
880     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
881       InlineCost += getCallsiteCost(*CI, DL);
882       continue;
883     }
884 
885     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
886       InlineCost += getCallsiteCost(*II, DL);
887       continue;
888     }
889 
890     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
891       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
892       continue;
893     }
894     InlineCost += InlineConstants::InstrCost;
895   }
896   return InlineCost;
897 }
898 
899 std::tuple<int, int>
900 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
901   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
902   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
903     Function *OutlinedFunc = FuncBBPair.first;
904     BasicBlock* OutliningCallBB = FuncBBPair.second;
905     // Now compute the cost of the call sequence to the outlined function
906     // 'OutlinedFunc' in BB 'OutliningCallBB':
907     OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
908 
909     // Now compute the cost of the extracted/outlined function itself:
910     for (BasicBlock &BB : *OutlinedFunc)
911       OutlinedFunctionCost += computeBBInlineCost(&BB);
912   }
913   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
914          "Outlined function cost should be no less than the outlined region");
915 
916   // The code extractor introduces new root and exit stub blocks with
917   // additional unconditional branches. Those branches will be eliminated
918   // later by basic block layout. The cost should be adjusted accordingly:
919   OutlinedFunctionCost -=
920       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
921 
922   int OutliningRuntimeOverhead =
923       OutliningFuncCallCost +
924       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
925       ExtraOutliningPenalty;
926 
927   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
928 }
929 
930 // Create the callsite-to-profile-count map, which is
931 // used to update the original function's entry count
932 // after the function is partially inlined into the callsite.
933 void PartialInlinerImpl::computeCallsiteToProfCountMap(
934     Function *DuplicateFunction,
935     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
936   std::vector<User *> Users(DuplicateFunction->user_begin(),
937                             DuplicateFunction->user_end());
938   Function *CurrentCaller = nullptr;
939   std::unique_ptr<BlockFrequencyInfo> TempBFI;
940   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
941 
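  // Compute (and cache) BFI for the current caller; it is recomputed only
  // when the caller changes between consecutive call sites.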
942   auto ComputeCurrBFI = [&,this](Function *Caller) {
943       // For the old pass manager:
944       if (!GetBFI) {
945         DominatorTree DT(*Caller);
946         LoopInfo LI(DT);
947         BranchProbabilityInfo BPI(*Caller, LI);
948         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
949         CurrentCallerBFI = TempBFI.get();
950       } else {
951         // New pass manager:
952         CurrentCallerBFI = &(*GetBFI)(*Caller);
953       }
954   };
955 
956   for (User *User : Users) {
957     CallSite CS = getCallSite(User);
958     Function *Caller = CS.getCaller();
959     if (CurrentCaller != Caller) {
960       CurrentCaller = Caller;
961       ComputeCurrBFI(Caller);
962     } else {
963       assert(CurrentCallerBFI && "CallerBFI is not set");
964     }
965     BasicBlock *CallBB = CS.getInstruction()->getParent();
966     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
967     if (Count)
968       CallSiteToProfCountMap[User] = *Count;
969     else
970       CallSiteToProfCountMap[User] = 0;
971   }
972 }
973 
974 PartialInlinerImpl::FunctionCloner::FunctionCloner(
975     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
976     function_ref<AssumptionCache *(Function &)> LookupAC)
977     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
978   ClonedOI = std::make_unique<FunctionOutliningInfo>();
979 
980   // Clone the function, so that we can hack away on it.
981   ValueToValueMapTy VMap;
982   ClonedFunc = CloneFunction(F, VMap);
983 
984   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
985   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
986   for (BasicBlock *BB : OI->Entries) {
987     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
988   }
989   for (BasicBlock *E : OI->ReturnBlockPreds) {
990     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
991     ClonedOI->ReturnBlockPreds.push_back(NewE);
992   }
993   // Go ahead and update all uses to the duplicate, so that we can just
994   // use the inliner functionality when we're done hacking.
995   F->replaceAllUsesWith(ClonedFunc);
996 }
997 
998 PartialInlinerImpl::FunctionCloner::FunctionCloner(
999     Function *F, FunctionOutliningMultiRegionInfo *OI,
1000     OptimizationRemarkEmitter &ORE,
1001     function_ref<AssumptionCache *(Function &)> LookupAC)
1002     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC) {
1003   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1004 
1005   // Clone the function, so that we can hack away on it.
1006   ValueToValueMapTy VMap;
1007   ClonedFunc = CloneFunction(F, VMap);
1008 
1009   // Go through all Outline Candidate Regions and update all BasicBlock
1010   // information.
1011   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1012        OI->ORI) {
1013     SmallVector<BasicBlock *, 8> Region;
1014     for (BasicBlock *BB : RegionInfo.Region) {
1015       Region.push_back(cast<BasicBlock>(VMap[BB]));
1016     }
1017     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1018     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1019     BasicBlock *NewReturnBlock = nullptr;
1020     if (RegionInfo.ReturnBlock)
1021       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1022     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1023         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1024     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1025   }
1026   // Go ahead and update all uses to the duplicate, so that we can just
1027   // use the inliner functionality when we're done hacking.
1028   F->replaceAllUsesWith(ClonedFunc);
1029 }
1030 
1031 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
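  // Return the first PHI node of BB, or nullptr if BB does not start with a
  // PHI.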
1032   auto getFirstPHI = [](BasicBlock *BB) {
1033     BasicBlock::iterator I = BB->begin();
1034     PHINode *FirstPhi = nullptr;
1035     while (I != BB->end()) {
1036       PHINode *Phi = dyn_cast<PHINode>(I);
1037       if (!Phi)
1038         break;
1039       if (!FirstPhi) {
1040         FirstPhi = Phi;
1041         break;
1042       }
1043     }
1044     return FirstPhi;
1045   };
1046 
1047   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1048   // blocks.
1049   if (!ClonedOI)
1050     return;
1051 
1052   // Special hackery is needed with PHI nodes that have inputs from more than
1053   // one extracted block.  For simplicity, just split the PHIs into a two-level
1054   // sequence of PHIs, some of which will go in the extracted region, and some
1055   // of which will go outside.
1056   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1057   // Only split the block when necessary:
1058   PHINode *FirstPhi = getFirstPHI(PreReturn);
1059   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1060 
1061   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1062     return;
1063 
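  // A PHI is trivial if all of its incoming values are the same; return that
  // common value, or nullptr if the PHI is not trivial.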
1064   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1065     Value *CommonValue = PN->getIncomingValue(0);
1066     if (all_of(PN->incoming_values(),
1067                [&](Value *V) { return V == CommonValue; }))
1068       return CommonValue;
1069     return nullptr;
1070   };
1071 
1072   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1073       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1074   BasicBlock::iterator I = PreReturn->begin();
1075   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1076   SmallVector<Instruction *, 4> DeadPhis;
1077   while (I != PreReturn->end()) {
1078     PHINode *OldPhi = dyn_cast<PHINode>(I);
1079     if (!OldPhi)
1080       break;
1081 
1082     PHINode *RetPhi =
1083         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1084     OldPhi->replaceAllUsesWith(RetPhi);
1085     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1086 
1087     RetPhi->addIncoming(&*I, PreReturn);
1088     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1089       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1090       OldPhi->removeIncomingValue(E);
1091     }
1092 
1093     // After splitting the incoming values, the old phi may become trivial.
1094     // Keeping the trivial phi can introduce a definition inside the outlined
1095     // region which is live-out, causing unnecessary overhead (load, store,
1096     // arg passing, etc).
1097     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1098       OldPhi->replaceAllUsesWith(OldPhiVal);
1099       DeadPhis.push_back(OldPhi);
1100     }
1101     ++I;
1102   }
1103   for (auto *DP : DeadPhis)
1104     DP->eraseFromParent();
1105 
1106   for (auto E : ClonedOI->ReturnBlockPreds) {
1107     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1108   }
1109 }
1110 
1111 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1112 
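  // Sum of the inline costs of all blocks in the region.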
1113   auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1114     int Cost = 0;
1115     for (BasicBlock* BB : Region)
1116       Cost += computeBBInlineCost(BB);
1117     return Cost;
1118   };
1119 
1120   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1121 
1122   if (ClonedOMRI->ORI.empty())
1123     return false;
1124 
1125   // The CodeExtractor needs a dominator tree.
1126   DominatorTree DT;
1127   DT.recalculate(*ClonedFunc);
1128 
1129   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1130   LoopInfo LI(DT);
1131   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1132   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1133 
1134   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1135   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1136 
1137   SetVector<Value *> Inputs, Outputs, Sinks;
1138   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1139        ClonedOMRI->ORI) {
1140     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1141 
1142     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1143                      ClonedFuncBFI.get(), &BPI,
1144                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1145                      /* AllowVarargs */ false);
1146 
1147     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1148 
1149 #ifndef NDEBUG
1150     if (TracePartialInlining) {
1151       dbgs() << "inputs: " << Inputs.size() << "\n";
1152       dbgs() << "outputs: " << Outputs.size() << "\n";
1153       for (Value *value : Inputs)
1154         dbgs() << "value used in func: " << *value << "\n";
1155       for (Value *output : Outputs)
1156         dbgs() << "instr used in func: " << *output << "\n";
1157     }
1158 #endif
1159     // Do not extract regions that have live exit variables.
1160     if (Outputs.size() > 0 && !ForceLiveExit)
1161       continue;
1162 
1163     Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
1164 
1165     if (OutlinedFunc) {
1166       CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1167       BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1168       assert(OutliningCallBB->getParent() == ClonedFunc);
1169       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1170       NumColdRegionsOutlined++;
1171       OutlinedRegionCost += CurrentOutlinedRegionCost;
1172 
1173       if (MarkOutlinedColdCC) {
1174         OutlinedFunc->setCallingConv(CallingConv::Cold);
1175         OCS.setCallingConv(CallingConv::Cold);
1176       }
1177     } else
1178       ORE.emit([&]() {
1179         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1180                                         &RegionInfo.Region.front()->front())
1181                << "Failed to extract region at block "
1182                << ore::NV("Block", RegionInfo.Region.front());
1183       });
1184   }
1185 
1186   return !OutlinedFunctions.empty();
1187 }
1188 
1189 Function *
1190 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1191   // Returns true if the block is to be partially inlined into the caller
1192   // (i.e. not to be extracted to the out-of-line function).
1193   auto ToBeInlined = [&, this](BasicBlock *BB) {
1194     return BB == ClonedOI->ReturnBlock ||
1195            is_contained(ClonedOI->Entries, BB);
1197   };
1198 
1199   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1200   // The CodeExtractor needs a dominator tree.
1201   DominatorTree DT;
1202   DT.recalculate(*ClonedFunc);
1203 
1204   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1205   LoopInfo LI(DT);
1206   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1207   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1208 
1209   // Gather up the blocks that we're going to extract.
1210   std::vector<BasicBlock *> ToExtract;
1211   ToExtract.push_back(ClonedOI->NonReturnBlock);
1212   OutlinedRegionCost +=
1213       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1214   for (BasicBlock &BB : *ClonedFunc)
1215     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1216       ToExtract.push_back(&BB);
1217       // FIXME: the code extractor may hoist/sink more code
1218       // into the outlined function which may make the outlining
1219       // overhead (the difference of the outlined function cost
1220       // and OutlinedRegionCost) look larger.
1221       OutlinedRegionCost += computeBBInlineCost(&BB);
1222     }
1223 
1224   // Extract the body of the if.
1225   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1226   Function *OutlinedFunc =
1227       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1228                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1229                     /* AllowVarargs */ true)
1230           .extractCodeRegion(CEAC);
1231 
1232   if (OutlinedFunc) {
1233     BasicBlock *OutliningCallBB =
1234         PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1235             .getInstruction()
1236             ->getParent();
1237     assert(OutliningCallBB->getParent() == ClonedFunc);
1238     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1239   } else
1240     ORE.emit([&]() {
1241       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1242                                       &ToExtract.front()->front())
1243              << "Failed to extract region at block "
1244              << ore::NV("Block", ToExtract.front());
1245     });
1246 
1247   return OutlinedFunc;
1248 }
1249 
1250 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1251   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1252   // users (function pointers, etc.) back to the original function.
1253   ClonedFunc->replaceAllUsesWith(OrigFunc);
1254   ClonedFunc->eraseFromParent();
1255   if (!IsFunctionInlined) {
1256     // Remove each function that was speculatively created if there is no
1257     // reference.
1258     for (auto FuncBBPair : OutlinedFunctions) {
1259       Function *Func = FuncBBPair.first;
1260       Func->eraseFromParent();
1261     }
1262   }
1263 }
1264 
1265 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1266 
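  // All users are assumed to be direct calls (see getCallSite), so skip
  // functions whose address is taken and may therefore have other uses.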
1267   if (F->hasAddressTaken())
1268     return {false, nullptr};
1269 
1270   // Let the inliner handle it.
1271   if (F->hasFnAttribute(Attribute::AlwaysInline))
1272     return {false, nullptr};
1273 
1274   if (F->hasFnAttribute(Attribute::NoInline))
1275     return {false, nullptr};
1276 
1277   if (PSI->isFunctionEntryCold(F))
1278     return {false, nullptr};
1279 
1280   if (F->users().empty())
1281     return {false, nullptr};
1282 
1283   OptimizationRemarkEmitter ORE(F);
1284 
1285   // Only try to outline cold regions if we have a profile summary, which
1286   // implies we have profiling information.
1287   if (PSI->hasProfileSummary() && F->hasProfileData() &&
1288       !DisableMultiRegionPartialInline) {
1289     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1290         computeOutliningColdRegionsInfo(F, ORE);
1291     if (OMRI) {
1292       FunctionCloner Cloner(F, OMRI.get(), ORE, LookupAssumptionCache);
1293 
1294 #ifndef NDEBUG
1295       if (TracePartialInlining) {
1296         dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1297         dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1298                << "\n";
1299       }
1300 #endif
1301       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1302 
1303       if (DidOutline) {
1304 #ifndef NDEBUG
1305         if (TracePartialInlining) {
1306           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1307           Cloner.ClonedFunc->print(dbgs());
1308           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1309         }
1310 #endif
1311 
1312         if (tryPartialInline(Cloner))
1313           return {true, nullptr};
1314       }
1315     }
1316   }
1317 
1318   // Fall through to regular partial inlining if we:
1319   //    i) can't find any cold regions to outline, or
1320   //   ii) can't inline the outlined function anywhere.
1321   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1322   if (!OI)
1323     return {false, nullptr};
1324 
1325   FunctionCloner Cloner(F, OI.get(), ORE, LookupAssumptionCache);
1326   Cloner.NormalizeReturnBlock();
1327 
1328   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1329 
1330   if (!OutlinedFunction)
1331     return {false, nullptr};
1332 
1333   bool AnyInline = tryPartialInline(Cloner);
1334 
1335   if (AnyInline)
1336     return {true, OutlinedFunction};
1337 
1338   return {false, nullptr};
1339 }
1340 
1341 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1342   if (Cloner.OutlinedFunctions.empty())
1343     return false;
1344 
1345   int SizeCost = 0;
1346   BlockFrequency WeightedRcost;
1347   int NonWeightedRcost;
1348   std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
1349 
1350   // Only calculate RelativeToEntryFreq when we are doing single region
1351   // outlining.
1352   BranchProbability RelativeToEntryFreq;
1353   if (Cloner.ClonedOI) {
1354     RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1355   } else
1356     // RelativeToEntryFreq doesn't make sense when we have more than one
1357     // outlined call because each call will have a different relative frequency
1358     // to the entry block.  We can consider using the average, but the
1359     // usefulness of that information is questionable. For now, assume we never
1360     // execute the calls to outlined functions.
1361     RelativeToEntryFreq = BranchProbability(0, 1);
1362 
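  // Weight the runtime cost of calling the outlined function by how often the
  // outlining call block executes relative to the entry block. Illustrative
  // numbers only: a non-weighted cost of 20 for a call block reached on 1 of
  // every 8 entries yields a weighted cost of roughly 20/8.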
1363   WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1364 
  // If the call sequence(s) to the outlined function(s) are larger than the
  // sum of the original outlined region size(s), partial inlining does not
  // increase the chance of inlining the function: the inliner uses the size
  // increase to model the cost of inlining a callee.
1369   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1370     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1371     DebugLoc DLoc;
1372     BasicBlock *Block;
1373     std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
1374     OrigFuncORE.emit([&]() {
1375       return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1376                                         DLoc, Block)
1377              << ore::NV("Function", Cloner.OrigFunc)
1378              << " not partially inlined into callers (Original Size = "
1379              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1380              << ", Size of call sequence to outlined function = "
1381              << ore::NV("NewSize", SizeCost) << ")";
1382     });
1383     return false;
1384   }
1385 
1386   assert(Cloner.OrigFunc->users().empty() &&
1387          "F's users should all be replaced!");
1388 
1389   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1390                             Cloner.ClonedFunc->user_end());
1391 
1392   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1393   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1394   if (CalleeEntryCount)
1395     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1396 
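  // CalleeEntryCountV tracks how much of the original entry count should
  // remain attributed to OrigFunc; the profile count of each successfully
  // inlined call site is subtracted from it below.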
1397   uint64_t CalleeEntryCountV =
1398       (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
1399 
1400   bool AnyInline = false;
1401   for (User *User : Users) {
1402     CallSite CS = getCallSite(User);
1403 
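    // IsLimitReached() presumably enforces a global cap on the number of
    // partial inlines performed (tracked by the NumPartialInlining counter
    // incremented below, which is distinct from the NumPartialInlined
    // statistic); once the cap is hit, the remaining call sites are skipped.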
1404     if (IsLimitReached())
1405       continue;
1406 
1407     OptimizationRemarkEmitter CallerORE(CS.getCaller());
1408     if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
1409       continue;
1410 
1411     // Construct remark before doing the inlining, as after successful inlining
1412     // the callsite is removed.
1413     OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
1414     OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1415        << ore::NV("Caller", CS.getCaller());
1416 
1417     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // Varargs can be forwarded to the outlined callee only when a single
    // region was outlined; for multi-region outlining no forwarding target is
    // passed, so inlining of vararg functions fails and the call site is
    // skipped.
1420     if (!InlineFunction(CS, IFI, nullptr, true,
1421                         (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1422                                          : nullptr))
1423              .isSuccess())
1424       continue;
1425 
1426     CallerORE.emit(OR);
1427 
1428     // Now update the entry count:
1429     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1430       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1431       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1432     }
1433 
1434     AnyInline = true;
1435     NumPartialInlining++;
    // Update the appropriate statistic for the kind of outlining performed.
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
1442   }
1443 
1444   if (AnyInline) {
1445     Cloner.IsFunctionInlined = true;
1446     if (CalleeEntryCount)
1447       Cloner.OrigFunc->setEntryCount(
1448           CalleeEntryCount.setCount(CalleeEntryCountV));
1449     OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1450     OrigFuncORE.emit([&]() {
1451       return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1452              << "Partially inlined into at least one caller";
1453     });
1455   }
1456 
1457   return AnyInline;
1458 }
1459 
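// Module driver: collect every defined function that has uses, skip directly
// recursive ones, and process the worklist; functions produced by outlining
// are pushed back so they can themselves be considered for partial inlining.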
1460 bool PartialInlinerImpl::run(Module &M) {
1461   if (DisablePartialInlining)
1462     return false;
1463 
1464   std::vector<Function *> Worklist;
1465   Worklist.reserve(M.size());
1466   for (Function &F : M)
1467     if (!F.use_empty() && !F.isDeclaration())
1468       Worklist.push_back(&F);
1469 
1470   bool Changed = false;
1471   while (!Worklist.empty()) {
1472     Function *CurrFunc = Worklist.back();
1473     Worklist.pop_back();
1474 
1475     if (CurrFunc->use_empty())
1476       continue;
1477 
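    // Skip functions that directly call themselves; partial inlining does not
    // try to handle recursion into the original function.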
1478     bool Recursive = false;
1479     for (User *U : CurrFunc->users())
1480       if (Instruction *I = dyn_cast<Instruction>(U))
1481         if (I->getParent()->getParent() == CurrFunc) {
1482           Recursive = true;
1483           break;
1484         }
1485     if (Recursive)
1486       continue;
1487 
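    // unswitchFunction() reports whether any caller changed and, for
    // single-region outlining, returns the newly created outlined function.
    // Queue that function so it can itself be considered for partial inlining.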
    std::pair<bool, Function *> Result = unswitchFunction(CurrFunc);
1489     if (Result.second)
1490       Worklist.push_back(Result.second);
1491     Changed |= Result.first;
1492   }
1493 
1494   return Changed;
1495 }
1496 
1497 char PartialInlinerLegacyPass::ID = 0;
1498 
1499 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1500                       "Partial Inliner", false, false)
1501 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1502 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1503 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1504 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1505 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1506                     "Partial Inliner", false, false)
1507 
1508 ModulePass *llvm::createPartialInliningPass() {
1509   return new PartialInlinerLegacyPass();
1510 }
1511 
1512 PreservedAnalyses PartialInlinerPass::run(Module &M,
1513                                           ModuleAnalysisManager &AM) {
1514   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1515 
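  // PartialInlinerImpl is shared with the legacy pass, so it is parameterized
  // by analysis lookup callbacks rather than an analysis manager; wire those
  // callbacks to the new pass manager's FunctionAnalysisManager here.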
1516   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1517       [&FAM](Function &F) -> AssumptionCache & {
1518     return FAM.getResult<AssumptionAnalysis>(F);
1519   };
1520 
1521   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1522     return FAM.getCachedResult<AssumptionAnalysis>(F);
1523   };
1524 
1525   std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1526       [&FAM](Function &F) -> BlockFrequencyInfo & {
1527     return FAM.getResult<BlockFrequencyAnalysis>(F);
1528   };
1529 
1530   std::function<TargetTransformInfo &(Function &)> GetTTI =
1531       [&FAM](Function &F) -> TargetTransformInfo & {
1532     return FAM.getResult<TargetIRAnalysis>(F);
1533   };
1534 
1535   std::function<const TargetLibraryInfo &(Function &)> GetTLI =
1536       [&FAM](Function &F) -> TargetLibraryInfo & {
1537     return FAM.getResult<TargetLibraryAnalysis>(F);
1538   };
1539 
1540   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1541 
1542   if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
1543                          {GetBFI}, &GetTLI, PSI)
1544           .run(M))
1545     return PreservedAnalyses::none();
1546   return PreservedAnalyses::all();
1547 }
1548