1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <functional>
59 #include <iterator>
60 #include <memory>
61 #include <tuple>
62 #include <vector>
63 
64 using namespace llvm;
65 
66 #define DEBUG_TYPE "partial-inlining"
67 
// Pass-wide statistics (printed with -stats). These count successful
// transformations, not attempts.
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");
77 
// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region (cold-region)
// partial-inlining. The default is false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
               cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing: skip the profitability analysis and
// partially inline whenever a candidate shape is found.
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function.  Default is set at 10%,
// i.e. the cold region must reduce the inlining cost of the original function
// by at least 10% to be outlined.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge, i.e. a confidence interval below which branch
// probabilities are considered unreliable.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%,
// i.e. if the branch probability is 10% or less, then it is deemed 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of blocks kept in the caller (entry chain plus
// return block) by single-region partial inlining.
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

// A debug knob: extra cost added on top of the computed outlining cost.
static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
147 
148 namespace {
149 
// Describes a single-region outlining candidate: a chain of "entry" blocks
// that stay in the caller and guard one outlined region, plus the
// surrounding return/non-return blocks.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
170 
171 struct FunctionOutliningMultiRegionInfo {
172   FunctionOutliningMultiRegionInfo()
173       : ORI() {}
174 
175   // Container for outline regions
176   struct OutlineRegionInfo {
177     OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
178                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
179                       BasicBlock *ReturnBlock)
180         : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
181           ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
182     SmallVector<BasicBlock *, 8> Region;
183     BasicBlock *EntryBlock;
184     BasicBlock *ExitBlock;
185     BasicBlock *ReturnBlock;
186   };
187 
188   SmallVector<OutlineRegionInfo, 4> ORI;
189 };
190 
// Implementation of the partial inliner: finds outlining candidates in each
// function, outlines them into new functions, and inlines the slimmed-down
// remainder into its callers when profitable.
struct PartialInlinerImpl {

  // All analyses are supplied as callbacks so the same implementation works
  // under both the legacy and the new pass manager. GBFI is optional; when
  // absent, BFI is computed locally where needed.
  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  // Run partial inlining over all functions in M. Returns true if the
  // module was changed.
  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise.  Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in
  // the outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function &F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    // Cleans up the speculative clone if it was not inlined.
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void normalizeReturnBlock() const;

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    // The original function being partially inlined.
    Function *OrigFunc = nullptr;
    // Speculative clone of OrigFunc that the regions are extracted from.
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    // Block frequencies for ClonedFunc, computed before outlining.
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  // Number of partial inlinings performed so far, checked against
  // MaxNumPartialInlining in isLimitReached().
  int NumPartialInlining = 0;
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the outlining call BB relative to F's entry
  // point. The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability
  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE) const;

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void
  computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                DenseMap<User *, uint64_t> &SiteCountMap) const;

  // True once NumPartialInlining has hit the -max-partial-inlining cap
  // (-1 means unlimited).
  bool isLimitReached() const {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Cast U to a CallBase; aborts if U is any other kind of user.
  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr; // Not reached.
  }

  // Return the call site at F's first use. Callers must ensure F has at
  // least one user (user_begin() is dereferenced unconditionally).
  static CallBase *getOneCallSiteTo(Function &F) {
    User *User = *F.user_begin();
    return getSupportedCallBase(User);
  }

  // Return the debug location and enclosing block of one call site of F,
  // used for emitting remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner) const;

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);

  // Find a single-region (early-return) outlining candidate in F, or null.
  std::unique_ptr<FunctionOutliningInfo>
  computeOutliningInfo(Function &F) const;

  // Find cold-region outlining candidates in F using profile data, or null.
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function &F,
                                  OptimizationRemarkEmitter &ORE) const;
};
345 
346 struct PartialInlinerLegacyPass : public ModulePass {
347   static char ID; // Pass identification, replacement for typeid
348 
349   PartialInlinerLegacyPass() : ModulePass(ID) {
350     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
351   }
352 
353   void getAnalysisUsage(AnalysisUsage &AU) const override {
354     AU.addRequired<AssumptionCacheTracker>();
355     AU.addRequired<ProfileSummaryInfoWrapperPass>();
356     AU.addRequired<TargetTransformInfoWrapperPass>();
357     AU.addRequired<TargetLibraryInfoWrapperPass>();
358   }
359 
360   bool runOnModule(Module &M) override {
361     if (skipModule(M))
362       return false;
363 
364     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
365     TargetTransformInfoWrapperPass *TTIWP =
366         &getAnalysis<TargetTransformInfoWrapperPass>();
367     ProfileSummaryInfo &PSI =
368         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
369 
370     auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
371       return ACT->getAssumptionCache(F);
372     };
373 
374     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
375       return ACT->lookupAssumptionCache(F);
376     };
377 
378     auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
379       return TTIWP->getTTI(F);
380     };
381 
382     auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
383       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
384     };
385 
386     return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
387                               GetTLI, PSI)
388         .run(M);
389   }
390 };
391 
392 } // end anonymous namespace
393 
394 std::unique_ptr<FunctionOutliningMultiRegionInfo>
395 PartialInlinerImpl::computeOutliningColdRegionsInfo(
396     Function &F, OptimizationRemarkEmitter &ORE) const {
397   BasicBlock *EntryBlock = &F.front();
398 
399   DominatorTree DT(F);
400   LoopInfo LI(DT);
401   BranchProbabilityInfo BPI(F, LI);
402   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
403   BlockFrequencyInfo *BFI;
404   if (!GetBFI) {
405     ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
406     BFI = ScopedBFI.get();
407   } else
408     BFI = &(GetBFI(F));
409 
410   // Return if we don't have profiling information.
411   if (!PSI.hasInstrumentationProfile())
412     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
413 
414   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
415       std::make_unique<FunctionOutliningMultiRegionInfo>();
416 
417   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
418     BasicBlock *Dom = BlockList.front();
419     return BlockList.size() > 1 && Dom->hasNPredecessors(1);
420   };
421 
422   auto IsSingleExit =
423       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
424     BasicBlock *ExitBlock = nullptr;
425     for (auto *Block : BlockList) {
426       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
427         if (!is_contained(BlockList, *SI)) {
428           if (ExitBlock) {
429             ORE.emit([&]() {
430               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
431                                               &SI->front())
432                      << "Region dominated by "
433                      << ore::NV("Block", BlockList.front()->getName())
434                      << " has more than one region exit edge.";
435             });
436             return nullptr;
437           }
438 
439           ExitBlock = Block;
440         }
441       }
442     }
443     return ExitBlock;
444   };
445 
446   auto BBProfileCount = [BFI](BasicBlock *BB) {
447     return BFI->getBlockProfileCount(BB)
448                ? BFI->getBlockProfileCount(BB).getValue()
449                : 0;
450   };
451 
452   // Use the same computeBBInlineCost function to compute the cost savings of
453   // the outlining the candidate region.
454   TargetTransformInfo *FTTI = &GetTTI(F);
455   int OverallFunctionCost = 0;
456   for (auto &BB : F)
457     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
458 
459   LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
460                     << "\n";);
461 
462   int MinOutlineRegionCost =
463       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
464   BranchProbability MinBranchProbability(
465       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
466       MinBlockCounterExecution);
467   bool ColdCandidateFound = false;
468   BasicBlock *CurrEntry = EntryBlock;
469   std::vector<BasicBlock *> DFS;
470   DenseMap<BasicBlock *, bool> VisitedMap;
471   DFS.push_back(CurrEntry);
472   VisitedMap[CurrEntry] = true;
473 
474   // Use Depth First Search on the basic blocks to find CFG edges that are
475   // considered cold.
476   // Cold regions considered must also have its inline cost compared to the
477   // overall inline cost of the original function.  The region is outlined only
478   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
479   // more.
480   while (!DFS.empty()) {
481     auto *ThisBB = DFS.back();
482     DFS.pop_back();
483     // Only consider regions with predecessor blocks that are considered
484     // not-cold (default: part of the top 99.99% of all block counters)
485     // AND greater than our minimum block execution count (default: 100).
486     if (PSI.isColdBlock(ThisBB, BFI) ||
487         BBProfileCount(ThisBB) < MinBlockCounterExecution)
488       continue;
489     for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
490       if (VisitedMap[*SI])
491         continue;
492       VisitedMap[*SI] = true;
493       DFS.push_back(*SI);
494       // If branch isn't cold, we skip to the next one.
495       BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
496       if (SuccProb > MinBranchProbability)
497         continue;
498 
499       LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
500                         << SI->getName()
501                         << "\nBranch Probability = " << SuccProb << "\n";);
502 
503       SmallVector<BasicBlock *, 8> DominateVector;
504       DT.getDescendants(*SI, DominateVector);
505 
506       // We can only outline single entry regions (for now).
507       if (!IsSingleEntry(DominateVector))
508         continue;
509 
510       BasicBlock *ExitBlock = nullptr;
511       // We can only outline single exit regions (for now).
512       if (!(ExitBlock = IsSingleExit(DominateVector)))
513         continue;
514 
515       int OutlineRegionCost = 0;
516       for (auto *BB : DominateVector)
517         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
518 
519       LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
520                         << "\n";);
521 
522       if (OutlineRegionCost < MinOutlineRegionCost) {
523         ORE.emit([&]() {
524           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
525                                             &SI->front())
526                  << ore::NV("Callee", &F)
527                  << " inline cost-savings smaller than "
528                  << ore::NV("Cost", MinOutlineRegionCost);
529         });
530         continue;
531       }
532       // For now, ignore blocks that belong to a SISE region that is a
533       // candidate for outlining.  In the future, we may want to look
534       // at inner regions because the outer region may have live-exit
535       // variables.
536       for (auto *BB : DominateVector)
537         VisitedMap[BB] = true;
538 
539       // ReturnBlock here means the block after the outline call
540       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
541       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
542           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
543       OutliningInfo->ORI.push_back(RegInfo);
544       LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
545                         << DominateVector.front()->getName() << "\n";);
546       ColdCandidateFound = true;
547       NumColdRegionsFound++;
548     }
549   }
550 
551   if (ColdCandidateFound)
552     return OutliningInfo;
553 
554   return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
555 }
556 
// Find a single-region outlining candidate in F: a chain of entry blocks,
// each ending in a conditional branch, where one successor is a return
// block (kept in the caller) and the rest of the function (dominated by
// NonReturnBlock) can be outlined. Returns null if no candidate is found.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
  BasicBlock *EntryBlock = &F.front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  // The entry must end in a conditional branch for this shape to exist.
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Order the pair as (return block, other block); (null, null) if neither
  // successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape: order the pair as (common successor, other
  // successor); (null, null) if neither is a successor of the other.
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk a chain of two-way branching blocks from the entry, collecting
  // them into Entries until a branch to a return block is found.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    // No direct return successor: continue along a triangle, treating the
    // non-common successor as the next guarding entry block.
    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  // NOTE(review): Entries is captured by value, so blocks inserted into the
  // outer Entries set later (in the growth loop below) are not visible to
  // this predicate — confirm this freezing of the set is intended.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verify that every entry block branches only within the entry set or to
  // {ReturnBlock, NonReturnBlock}, and has no incoming edges from outside
  // the set; also record which entries feed ReturnBlock.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    // The peeled block must branch to the same return block as the chain,
    // and its non-return successor must be reachable only through it.
    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
704 
705 // Check if there is PGO data or user annotated branch data:
706 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
707   if (F.hasProfileData())
708     return true;
709   // Now check if any of the entry block has MD_prof data:
710   for (auto *E : OI.Entries) {
711     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
712     if (!BR || BR->isUnconditional())
713       continue;
714     uint64_t T, F;
715     if (BR->extractProfMetadata(T, F))
716       return true;
717   }
718   return false;
719 }
720 
721 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
722     FunctionCloner &Cloner) const {
723   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
724   auto EntryFreq =
725       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
726   auto OutliningCallFreq =
727       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
728   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
729   // we outlined any regions, so we may encounter situations where the
730   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
731   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
732     OutliningCallFreq = EntryFreq;
733 
734   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
735       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
736 
737   if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
738     return OutlineRegionRelFreq;
739 
740   // When profile data is not available, we need to be conservative in
741   // estimating the overall savings. Static branch prediction can usually
742   // guess the branch direction right (taken/non-taken), but the guessed
743   // branch probability is usually not biased enough. In case when the
744   // outlined region is predicted to be likely, its probability needs
745   // to be made higher (more biased) to not under-estimate the cost of
746   // function outlining. On the other hand, if the outlined region
747   // is predicted to be less likely, the predicted probablity is usually
748   // higher than the actual. For instance, the actual probability of the
749   // less likely target is only 5%, but the guessed probablity can be
750   // 40%. In the latter case, there is no need for further adjustement.
751   // FIXME: add an option for this.
752   if (OutlineRegionRelFreq < BranchProbability(45, 100))
753     return OutlineRegionRelFreq;
754 
755   OutlineRegionRelFreq = std::max(
756       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
757 
758   return OutlineRegionRelFreq;
759 }
760 
// Decide whether partially inlining into call site \p CB is profitable.
// Runs the regular inliner's cost model on the pruned callee clone and, in
// addition, compares the weighted runtime cost of calling the outlined
// function (WeightedOutliningRcost) against the savings from removing the
// original call. Emits an optimization remark explaining each decision.
bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // With cost analysis disabled, only legality of inlining matters.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  // An 'always' result means the regular inliner should fully inline this
  // callee; partial inlining would only lose code.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  // 'never' means inlining is forbidden (or known unprofitable at any size).
  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  // InlineCost converts to false when the cost exceeds the threshold.
  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  // All checks passed: partial inlining is expected to be profitable here.
  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
842 
843 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
844 // For now use a simplified version. The returned 'InlineCost' will be used
845 // to esimate the size cost as well as runtime cost of the BB.
846 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
847                                             TargetTransformInfo *TTI) {
848   int InlineCost = 0;
849   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
850   for (Instruction &I : BB->instructionsWithoutDebug()) {
851     // Skip free instructions.
852     switch (I.getOpcode()) {
853     case Instruction::BitCast:
854     case Instruction::PtrToInt:
855     case Instruction::IntToPtr:
856     case Instruction::Alloca:
857     case Instruction::PHI:
858       continue;
859     case Instruction::GetElementPtr:
860       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
861         continue;
862       break;
863     default:
864       break;
865     }
866 
867     if (I.isLifetimeStartOrEnd())
868       continue;
869 
870     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
871       Intrinsic::ID IID = II->getIntrinsicID();
872       SmallVector<Type *, 4> Tys;
873       FastMathFlags FMF;
874       for (Value *Val : II->args())
875         Tys.push_back(Val->getType());
876 
877       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
878         FMF = FPMO->getFastMathFlags();
879 
880       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
881       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
882       continue;
883     }
884 
885     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
886       InlineCost += getCallsiteCost(*CI, DL);
887       continue;
888     }
889 
890     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
891       InlineCost += getCallsiteCost(*II, DL);
892       continue;
893     }
894 
895     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
896       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
897       continue;
898     }
899     InlineCost += InlineConstants::InstrCost;
900   }
901   return InlineCost;
902 }
903 
904 std::tuple<int, int>
905 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
906   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
907   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
908     Function *OutlinedFunc = FuncBBPair.first;
909     BasicBlock* OutliningCallBB = FuncBBPair.second;
910     // Now compute the cost of the call sequence to the outlined function
911     // 'OutlinedFunction' in BB 'OutliningCallBB':
912     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
913     OutliningFuncCallCost +=
914         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
915 
916     // Now compute the cost of the extracted/outlined function itself:
917     for (BasicBlock &BB : *OutlinedFunc)
918       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
919   }
920   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
921          "Outlined function cost should be no less than the outlined region");
922 
923   // The code extractor introduces a new root and exit stub blocks with
924   // additional unconditional branches. Those branches will be eliminated
925   // later with bb layout. The cost should be adjusted accordingly:
926   OutlinedFunctionCost -=
927       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
928 
929   int OutliningRuntimeOverhead =
930       OutliningFuncCallCost +
931       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
932       ExtraOutliningPenalty;
933 
934   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
935 }
936 
937 // Create the callsite to profile count map which is
938 // used to update the original function's entry count,
939 // after the function is partially inlined into the callsite.
940 void PartialInlinerImpl::computeCallsiteToProfCountMap(
941     Function *DuplicateFunction,
942     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
943   std::vector<User *> Users(DuplicateFunction->user_begin(),
944                             DuplicateFunction->user_end());
945   Function *CurrentCaller = nullptr;
946   std::unique_ptr<BlockFrequencyInfo> TempBFI;
947   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
948 
949   auto ComputeCurrBFI = [&,this](Function *Caller) {
950       // For the old pass manager:
951       if (!GetBFI) {
952         DominatorTree DT(*Caller);
953         LoopInfo LI(DT);
954         BranchProbabilityInfo BPI(*Caller, LI);
955         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
956         CurrentCallerBFI = TempBFI.get();
957       } else {
958         // New pass manager:
959         CurrentCallerBFI = &(GetBFI(*Caller));
960       }
961   };
962 
963   for (User *User : Users) {
964     CallBase *CB = getSupportedCallBase(User);
965     Function *Caller = CB->getCaller();
966     if (CurrentCaller != Caller) {
967       CurrentCaller = Caller;
968       ComputeCurrBFI(Caller);
969     } else {
970       assert(CurrentCallerBFI && "CallerBFI is not set");
971     }
972     BasicBlock *CallBB = CB->getParent();
973     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
974     if (Count)
975       CallSiteToProfCountMap[User] = *Count;
976     else
977       CallSiteToProfCountMap[User] = 0;
978   }
979 }
980 
981 PartialInlinerImpl::FunctionCloner::FunctionCloner(
982     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
983     function_ref<AssumptionCache *(Function &)> LookupAC,
984     function_ref<TargetTransformInfo &(Function &)> GetTTI)
985     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
986   ClonedOI = std::make_unique<FunctionOutliningInfo>();
987 
988   // Clone the function, so that we can hack away on it.
989   ValueToValueMapTy VMap;
990   ClonedFunc = CloneFunction(F, VMap);
991 
992   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
993   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
994   for (BasicBlock *BB : OI->Entries)
995     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
996 
997   for (BasicBlock *E : OI->ReturnBlockPreds) {
998     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
999     ClonedOI->ReturnBlockPreds.push_back(NewE);
1000   }
1001   // Go ahead and update all uses to the duplicate, so that we can just
1002   // use the inliner functionality when we're done hacking.
1003   F->replaceAllUsesWith(ClonedFunc);
1004 }
1005 
1006 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1007     Function *F, FunctionOutliningMultiRegionInfo *OI,
1008     OptimizationRemarkEmitter &ORE,
1009     function_ref<AssumptionCache *(Function &)> LookupAC,
1010     function_ref<TargetTransformInfo &(Function &)> GetTTI)
1011     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1012   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1013 
1014   // Clone the function, so that we can hack away on it.
1015   ValueToValueMapTy VMap;
1016   ClonedFunc = CloneFunction(F, VMap);
1017 
1018   // Go through all Outline Candidate Regions and update all BasicBlock
1019   // information.
1020   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1021        OI->ORI) {
1022     SmallVector<BasicBlock *, 8> Region;
1023     for (BasicBlock *BB : RegionInfo.Region)
1024       Region.push_back(cast<BasicBlock>(VMap[BB]));
1025 
1026     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1027     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1028     BasicBlock *NewReturnBlock = nullptr;
1029     if (RegionInfo.ReturnBlock)
1030       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1031     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1032         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1033     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1034   }
1035   // Go ahead and update all uses to the duplicate, so that we can just
1036   // use the inliner functionality when we're done hacking.
1037   F->replaceAllUsesWith(ClonedFunc);
1038 }
1039 
// Normalize the cloned function's return block: split its PHI nodes into a
// two-level sequence so that inputs coming from the to-be-outlined region
// stay inside the extracted part, while inputs from the entry blocks that
// remain inlined feed a new PHI in a freshly split return block.
void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
  // Return BB's leading PHI, or null if BB does not begin with a PHI.
  // NOTE(review): every path through the loop body breaks on the first
  // iteration, so the 'while' effectively acts as an 'if'; the result is
  // still correct, just written in a roundabout way.
  auto GetFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = GetFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // Splitting is only needed when the PHI merges more edges than the entry
  // predecessors plus the single edge that will come from the outlined part.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // A PHI is trivial when all of its incoming values are the same.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split everything after the PHIs into a new return block; 'PreReturn'
  // (the old block) becomes part of the region to be outlined.
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // Second-level PHI in the new return block: one slot for the value coming
    // out of PreReturn plus one slot per entry-block predecessor.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    // Move the entry-block inputs off the old PHI and onto the new one.
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  // Erase the trivial PHIs only after the iteration is done.
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Reroute the entry-block predecessors to branch to the new return block.
  for (auto *E : ClonedOI->ReturnBlockPreds)
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
}
1118 
1119 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1120 
1121   auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
1122     int Cost = 0;
1123     for (BasicBlock* BB : Region)
1124       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1125     return Cost;
1126   };
1127 
1128   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1129 
1130   if (ClonedOMRI->ORI.empty())
1131     return false;
1132 
1133   // The CodeExtractor needs a dominator tree.
1134   DominatorTree DT;
1135   DT.recalculate(*ClonedFunc);
1136 
1137   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1138   LoopInfo LI(DT);
1139   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1140   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1141 
1142   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1143   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1144 
1145   SetVector<Value *> Inputs, Outputs, Sinks;
1146   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1147        ClonedOMRI->ORI) {
1148     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1149 
1150     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1151                      ClonedFuncBFI.get(), &BPI,
1152                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1153                      /* AllowVarargs */ false);
1154 
1155     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1156 
1157     LLVM_DEBUG({
1158       dbgs() << "inputs: " << Inputs.size() << "\n";
1159       dbgs() << "outputs: " << Outputs.size() << "\n";
1160       for (Value *value : Inputs)
1161         dbgs() << "value used in func: " << *value << "\n";
1162       for (Value *output : Outputs)
1163         dbgs() << "instr used in func: " << *output << "\n";
1164     });
1165 
1166     // Do not extract regions that have live exit variables.
1167     if (Outputs.size() > 0 && !ForceLiveExit)
1168       continue;
1169 
1170     if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1171       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1172       BasicBlock *OutliningCallBB = OCS->getParent();
1173       assert(OutliningCallBB->getParent() == ClonedFunc);
1174       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1175       NumColdRegionsOutlined++;
1176       OutlinedRegionCost += CurrentOutlinedRegionCost;
1177 
1178       if (MarkOutlinedColdCC) {
1179         OutlinedFunc->setCallingConv(CallingConv::Cold);
1180         OCS->setCallingConv(CallingConv::Cold);
1181       }
1182     } else
1183       ORE.emit([&]() {
1184         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1185                                         &RegionInfo.Region.front()->front())
1186                << "Failed to extract region at block "
1187                << ore::NV("Block", RegionInfo.Region.front());
1188       });
1189   }
1190 
1191   return !OutlinedFunctions.empty();
1192 }
1193 
1194 Function *
1195 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1196   // Returns true if the block is to be partial inlined into the caller
1197   // (i.e. not to be extracted to the out of line function)
1198   auto ToBeInlined = [&, this](BasicBlock *BB) {
1199     return BB == ClonedOI->ReturnBlock ||
1200            llvm::is_contained(ClonedOI->Entries, BB);
1201   };
1202 
1203   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1204   // The CodeExtractor needs a dominator tree.
1205   DominatorTree DT;
1206   DT.recalculate(*ClonedFunc);
1207 
1208   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1209   LoopInfo LI(DT);
1210   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1211   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1212 
1213   // Gather up the blocks that we're going to extract.
1214   std::vector<BasicBlock *> ToExtract;
1215   auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1216   ToExtract.push_back(ClonedOI->NonReturnBlock);
1217   OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1218       ClonedOI->NonReturnBlock, ClonedFuncTTI);
1219   for (BasicBlock &BB : *ClonedFunc)
1220     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1221       ToExtract.push_back(&BB);
1222       // FIXME: the code extractor may hoist/sink more code
1223       // into the outlined function which may make the outlining
1224       // overhead (the difference of the outlined function cost
1225       // and OutliningRegionCost) look larger.
1226       OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1227     }
1228 
1229   // Extract the body of the if.
1230   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1231   Function *OutlinedFunc =
1232       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1233                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1234                     /* AllowVarargs */ true)
1235           .extractCodeRegion(CEAC);
1236 
1237   if (OutlinedFunc) {
1238     BasicBlock *OutliningCallBB =
1239         PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1240     assert(OutliningCallBB->getParent() == ClonedFunc);
1241     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1242   } else
1243     ORE.emit([&]() {
1244       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1245                                       &ToExtract.front()->front())
1246              << "Failed to extract region at block "
1247              << ore::NV("Block", ToExtract.front());
1248     });
1249 
1250   return OutlinedFunc;
1251 }
1252 
1253 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1254   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1255   // users (function pointers, etc.) back to the original function.
1256   ClonedFunc->replaceAllUsesWith(OrigFunc);
1257   ClonedFunc->eraseFromParent();
1258   if (!IsFunctionInlined) {
1259     // Remove each function that was speculatively created if there is no
1260     // reference.
1261     for (auto FuncBBPair : OutlinedFunctions) {
1262       Function *Func = FuncBBPair.first;
1263       Func->eraseFromParent();
1264     }
1265   }
1266 }
1267 
1268 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1269   if (F.hasAddressTaken())
1270     return {false, nullptr};
1271 
1272   // Let inliner handle it
1273   if (F.hasFnAttribute(Attribute::AlwaysInline))
1274     return {false, nullptr};
1275 
1276   if (F.hasFnAttribute(Attribute::NoInline))
1277     return {false, nullptr};
1278 
1279   if (PSI.isFunctionEntryCold(&F))
1280     return {false, nullptr};
1281 
1282   if (F.users().empty())
1283     return {false, nullptr};
1284 
1285   OptimizationRemarkEmitter ORE(&F);
1286 
1287   // Only try to outline cold regions if we have a profile summary, which
1288   // implies we have profiling information.
1289   if (PSI.hasProfileSummary() && F.hasProfileData() &&
1290       !DisableMultiRegionPartialInline) {
1291     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1292         computeOutliningColdRegionsInfo(F, ORE);
1293     if (OMRI) {
1294       FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1295 
1296       LLVM_DEBUG({
1297         dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1298         dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1299                << "\n";
1300       });
1301 
1302       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1303 
1304       if (DidOutline) {
1305         LLVM_DEBUG({
1306           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1307           Cloner.ClonedFunc->print(dbgs());
1308           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1309         });
1310 
1311         if (tryPartialInline(Cloner))
1312           return {true, nullptr};
1313       }
1314     }
1315   }
1316 
1317   // Fall-thru to regular partial inlining if we:
1318   //    i) can't find any cold regions to outline, or
1319   //   ii) can't inline the outlined function anywhere.
1320   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1321   if (!OI)
1322     return {false, nullptr};
1323 
1324   FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1325   Cloner.normalizeReturnBlock();
1326 
1327   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1328 
1329   if (!OutlinedFunction)
1330     return {false, nullptr};
1331 
1332   if (tryPartialInline(Cloner))
1333     return {true, OutlinedFunction};
1334 
1335   return {false, nullptr};
1336 }
1337 
// Try to inline the pruned clone into every supported call site of the
// original function. Returns true if at least one call site was partially
// inlined; in that case the speculatively outlined functions are kept.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The cloner RAUW'ed the original function with the clone, so by now the
  // original must have no remaining uses.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users up front: successful inlining erases call sites.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  // Per-call-site profile counts, used below to deduct inlined counts from
  // the original function's entry count.
  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallBase *CB = getSupportedCallBase(User);

    // Respect the global partial-inlining count limit, if set.
    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    // Marking the cloner keeps the outlined functions alive in the
    // destructor; fold the residual count back into the original function.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}
1454 
1455 bool PartialInlinerImpl::run(Module &M) {
1456   if (DisablePartialInlining)
1457     return false;
1458 
1459   std::vector<Function *> Worklist;
1460   Worklist.reserve(M.size());
1461   for (Function &F : M)
1462     if (!F.use_empty() && !F.isDeclaration())
1463       Worklist.push_back(&F);
1464 
1465   bool Changed = false;
1466   while (!Worklist.empty()) {
1467     Function *CurrFunc = Worklist.back();
1468     Worklist.pop_back();
1469 
1470     if (CurrFunc->use_empty())
1471       continue;
1472 
1473     bool Recursive = false;
1474     for (User *U : CurrFunc->users())
1475       if (Instruction *I = dyn_cast<Instruction>(U))
1476         if (I->getParent()->getParent() == CurrFunc) {
1477           Recursive = true;
1478           break;
1479         }
1480     if (Recursive)
1481       continue;
1482 
1483     std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1484     if (Result.second)
1485       Worklist.push_back(Result.second);
1486     Changed |= Result.first;
1487   }
1488 
1489   return Changed;
1490 }
1491 
char PartialInlinerLegacyPass::ID = 0;

// Register the legacy pass with the pass registry and declare the analyses
// it depends on.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)
1502 
// Factory used by the legacy pass manager to create the pass instance.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1506 
1507 PreservedAnalyses PartialInlinerPass::run(Module &M,
1508                                           ModuleAnalysisManager &AM) {
1509   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1510 
1511   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1512     return FAM.getResult<AssumptionAnalysis>(F);
1513   };
1514 
1515   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1516     return FAM.getCachedResult<AssumptionAnalysis>(F);
1517   };
1518 
1519   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1520     return FAM.getResult<BlockFrequencyAnalysis>(F);
1521   };
1522 
1523   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1524     return FAM.getResult<TargetIRAnalysis>(F);
1525   };
1526 
1527   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1528     return FAM.getResult<TargetLibraryAnalysis>(F);
1529   };
1530 
1531   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1532 
1533   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1534                          GetTLI, PSI, GetBFI)
1535           .run(M))
1536     return PreservedAnalyses::none();
1537   return PreservedAnalyses::all();
1538 }
1539