//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into their caller(s).");
STATISTIC(NumColdRegionsFound,
          "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
          "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// Command line option to disable multi-region partial-inlining. The default
// is false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
                  cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

// Used to determine whether a cold region is worth outlining based on its
// inlining cost compared to the original function. The default is 10%, i.e.
// the cold region must reduce the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));

// Used to tune the minimum number of execution counts needed in the
// predecessor block to the cold edge, i.e. a confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));

// Used to determine when an edge is considered cold. The default is 10%, i.e.
// if the branch probability is 10% or less, the edge is deemed cold.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is the
// lower bound of the relative frequency assigned to the outline region; if
// BFI produces a larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined, including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo()
      : ORI() {}

  // Container for outline regions.
  struct OutlineRegionInfo {
    OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
          ReturnBlock(ReturnBlock) {}
    SmallVector<BasicBlock *, 8> Region;
    BasicBlock *EntryBlock;
    BasicBlock *ExitBlock;
    BasicBlock *ReturnBlock;
  };

  SmallVector<OutlineRegionInfo, 4> ORI;
};

struct PartialInlinerImpl {

  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);

  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if inlining was
  // successful, false otherwise. Also returns the outline function (only if
  // we partially inlined early returns) as there is a possibility to further
  // "peel" early return statements that were left in the outline function
  // due to code size.
  std::pair<bool, Function *> unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE);
    ~FunctionCloner();

    // Prepare for function outlining: make sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();

    // Do function outlining for the region after the early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks.)
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from each use site of DuplicateFunction to the
  // enclosing BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

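  // Return the call site of the first user of F. This assumes F has at least
  // one user and that every user is a call or invoke instruction.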
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in the
  //   basic block that contains the call to the outlined function;
  // - The second value is the non-weighted runtime cost of making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself.
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function *F, OptimizationRemarkEmitter &ORE);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI)
        .run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(
    Function *F, OptimizationRemarkEmitter &ORE) {
  BasicBlock *EntryBlock = &F->front();

  DominatorTree DT(*F);
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*F, LI);
  std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
  BlockFrequencyInfo *BFI;
  if (!GetBFI) {
    ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
    BFI = ScopedBFI.get();
  } else
    BFI = &(*GetBFI)(*F);

  // Return if we don't have profiling information.
  if (!PSI->hasInstrumentationProfile())
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();

  std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningMultiRegionInfo>();

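  // A candidate region is treated as single-entry when its dominating block
  // (the front of BlockList) has exactly one predecessor; regions consisting
  // of a single block are not considered.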
  auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
    BasicBlock *Dom = BlockList.front();
    return BlockList.size() > 1 && Dom->hasNPredecessors(1);
  };

  auto IsSingleExit =
      [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
    BasicBlock *ExitBlock = nullptr;
    for (auto *Block : BlockList) {
      for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
        if (!is_contained(BlockList, *SI)) {
          if (ExitBlock) {
            ORE.emit([&]() {
              return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
                                              &SI->front())
                     << "Region dominated by "
                     << ore::NV("Block", BlockList.front()->getName())
                     << " has more than one region exit edge.";
            });
            return nullptr;
          } else
            ExitBlock = Block;
        }
      }
    }
    return ExitBlock;
  };

  auto BBProfileCount = [BFI](BasicBlock *BB) {
    return BFI->getBlockProfileCount(BB)
               ? BFI->getBlockProfileCount(BB).getValue()
               : 0;
  };

  // Use the same computeBBInlineCost function to compute the cost savings of
  // outlining the candidate region.
  int OverallFunctionCost = 0;
  for (auto &BB : *F)
    OverallFunctionCost += computeBBInlineCost(&BB);

#ifndef NDEBUG
  if (TracePartialInlining)
    dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
#endif

  int MinOutlineRegionCost =
      static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
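
  // Express ColdBranchRatio as a BranchProbability whose denominator is
  // MinBlockCounterExecution; an edge whose probability is at most this
  // value is treated as cold.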
  BranchProbability MinBranchProbability(
      static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
      MinBlockCounterExecution);
  bool ColdCandidateFound = false;
  BasicBlock *CurrEntry = EntryBlock;
  std::vector<BasicBlock *> DFS;
  DenseMap<BasicBlock *, bool> VisitedMap;
  DFS.push_back(CurrEntry);
  VisitedMap[CurrEntry] = true;

  // Use depth first search on the basic blocks to find CFG edges that are
  // considered cold.
  // The inline cost of a candidate cold region is also compared against the
  // overall inline cost of the original function. The region is outlined only
  // if it reduces the inline cost of the function by 'MinOutlineRegionCost'
  // or more.
  while (!DFS.empty()) {
    auto *thisBB = DFS.back();
    DFS.pop_back();
    // Only consider regions with predecessor blocks that are considered
    // not-cold (default: part of the top 99.99% of all block counters)
    // AND greater than our minimum block execution count (default: 100).
    if (PSI->isColdBlock(thisBB, BFI) ||
        BBProfileCount(thisBB) < MinBlockCounterExecution)
      continue;
    for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
      if (VisitedMap[*SI])
        continue;
      VisitedMap[*SI] = true;
      DFS.push_back(*SI);
      // If the branch isn't cold, skip to the next successor.
      BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
      if (SuccProb > MinBranchProbability)
        continue;
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found cold edge: " << thisBB->getName() << "->"
               << (*SI)->getName() << "\nBranch Probability = " << SuccProb
               << "\n";
      }
#endif
      SmallVector<BasicBlock *, 8> DominateVector;
      DT.getDescendants(*SI, DominateVector);
      // We can only outline single entry regions (for now).
      if (!IsSingleEntry(DominateVector))
        continue;
      BasicBlock *ExitBlock = nullptr;
      // We can only outline single exit regions (for now).
      if (!(ExitBlock = IsSingleExit(DominateVector)))
        continue;
      int OutlineRegionCost = 0;
      for (auto *BB : DominateVector)
        OutlineRegionCost += computeBBInlineCost(BB);

#ifndef NDEBUG
      if (TracePartialInlining)
        dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
#endif

      if (OutlineRegionCost < MinOutlineRegionCost) {
        ORE.emit([&]() {
          return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
                                            &SI->front())
                 << ore::NV("Callee", F) << " inline cost-savings smaller than "
                 << ore::NV("Cost", MinOutlineRegionCost);
        });
        continue;
      }
      // For now, ignore blocks that belong to a single-entry/single-exit
      // (SISE) region that is a candidate for outlining.  In the future, we
      // may want to look at inner regions because the outer region may have
      // live-exit variables.
      for (auto *BB : DominateVector)
        VisitedMap[BB] = true;
      // ReturnBlock here means the block after the outline call.
      BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
      // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
      FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
          DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
      RegInfo.Region = DominateVector;
      OutliningInfo->ORI.push_back(RegInfo);
#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "Found Cold Candidate starting at block: "
               << DominateVector.front()->getName() << "\n";
      }
#endif
      ColdCandidateFound = true;
      NumColdRegionsFound++;
    }
  }

  if (ColdCandidateFound)
    return OutliningInfo;
  else
    return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
}

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
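
  // Walk down from the function entry, collecting the guarding entry blocks.
  // Each block examined must end in a conditional branch with exactly two
  // successors: either one of them is a return block (a candidate is found),
  // or the two successors form a triangular shape, in which case the walk
  // continues with the non-common successor.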
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do a sanity check of the entries: there should not be any successors
  // (not in the entry set) other than {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further grow the candidate's inlining region by peeling off
  // dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->hasProfileData())
    return true;
  // Now check if any of the entry blocks have MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
  // FIXME: Hackery needed because ClonedFuncBFI is based on the function
  // BEFORE we outlined any regions, so we may encounter situations where the
  // OutliningCallFreq is *slightly* bigger than the EntryFreq.
  if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
    OutliningCallFreq = EntryFreq;
  }
  auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
      OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. When the outlined
  // region is predicted to be likely, its probability needs to be made
  // higher (more biased) to not under-estimate the cost of function
  // outlining. On the other hand, if the outlined region is predicted to be
  // less likely, the predicted probability is usually higher than the
  // actual. For instance, the actual probability of the less likely target
  // may be only 5%, while the guessed probability can be 40%. In the latter
  // case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  if (SkipCostAnalysis)
    return isInlineViable(*Callee);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted savings is smaller than the weighted cost, return false.
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share the Inliner's InlineCost analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as the runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
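
  // Cost model: bitcasts, int<->pointer casts, allocas, PHIs, zero-index
  // GEPs, lifetime markers and debug intrinsics are free; calls and invokes
  // are charged via getCallsiteCost; a switch is charged (number of cases
  // + 1) * InstrCost; every other instruction costs InstrCost.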
  for (Instruction &I : BB->instructionsWithoutDebug()) {
    // Skip free instructions.
    switch (I.getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
    case Instruction::PHI:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
        continue;
      break;
    default:
      break;
    }

    if (I.isLifetimeStartOrEnd())
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
  for (auto FuncBBPair : Cloner.OutlinedFunctions) {
    Function *OutlinedFunc = FuncBBPair.first;
    BasicBlock *OutliningCallBB = FuncBBPair.second;
    // Now compute the cost of the call sequence to the outlined function
    // 'OutlinedFunction' in BB 'OutliningCallBB':
    OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);

    // Now compute the cost of the extracted/outlined function itself:
    for (BasicBlock &BB : *OutlinedFunc)
      OutlinedFunctionCost += computeBBInlineCost(&BB);
  }
  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");

  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -=
      2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite to profile count map, which is used to update the
// original function's entry count after the function is partially inlined
// into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(
    Function *F, FunctionOutliningMultiRegionInfo *OI,
    OptimizationRemarkEmitter &ORE)
    : OrigFunc(F), ORE(ORE) {
  ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Go through all Outline Candidate Regions and update all BasicBlock
  // information.
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       OI->ORI) {
    SmallVector<BasicBlock *, 8> Region;
    for (BasicBlock *BB : RegionInfo.Region) {
      Region.push_back(cast<BasicBlock>(VMap[BB]));
    }
    BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
    BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
    BasicBlock *NewReturnBlock = nullptr;
    if (RegionInfo.ReturnBlock)
      NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
    FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
        Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
    ClonedOMRI->ORI.push_back(MappedRegionInfo);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

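  // Split the return block at its first non-PHI instruction: PreReturn keeps
  // the original PHIs (and will be extracted along with the outlined region),
  // while the new ReturnBlock gets fresh PHIs that merge the value coming out
  // of the extracted region with the values flowing in directly from the
  // guarding entry blocks.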
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After splitting the incoming values, the old phi may become trivial.
    // Keeping the trivial phi can introduce a definition inside the outlined
    // region which is live-out, causing unnecessary overhead (load, store,
    // arg passing, etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {

  auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
    int Cost = 0;
    for (BasicBlock *BB : Region)
      Cost += computeBBInlineCost(BB);
    return Cost;
  };

  assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");

  if (ClonedOMRI->ORI.empty())
    return false;

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  SetVector<Value *> Inputs, Outputs, Sinks;
  for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
       ClonedOMRI->ORI) {
    int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);

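    // Extract this cold region into its own function. Arguments are passed
    // individually (AggregateArgs is off) and varargs are not allowed;
    // regions with live exit values are skipped below unless outlining is
    // forced.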
    CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
                     ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false);

    CE.findInputsOutputs(Inputs, Outputs, Sinks);

#ifndef NDEBUG
    if (TracePartialInlining) {
      dbgs() << "inputs: " << Inputs.size() << "\n";
      dbgs() << "outputs: " << Outputs.size() << "\n";
      for (Value *value : Inputs)
        dbgs() << "value used in func: " << *value << "\n";
      for (Value *output : Outputs)
        dbgs() << "instr used in func: " << *output << "\n";
    }
#endif

    // Do not extract regions that have live exit variables.
    if (Outputs.size() > 0 && !ForceLiveExit)
      continue;

    Function *OutlinedFunc = CE.extractCodeRegion();

    if (OutlinedFunc) {
      CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
      BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
      assert(OutliningCallBB->getParent() == ClonedFunc);
      OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
      NumColdRegionsOutlined++;
      OutlinedRegionCost += CurrentOutlinedRegionCost;

      if (MarkOutlinedColdCC) {
        OutlinedFunc->setCallingConv(CallingConv::Cold);
        OCS.setCallingConv(CallingConv::Cold);
      }
    } else
      ORE.emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                        &RegionInfo.Region.front()->front())
               << "Failed to extract region at block "
               << ore::NV("Block", RegionInfo.Region.front());
      });
  }

  return !OutlinedFunctions.empty();
}

Function *
PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted into the out-of-line function).
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
            ClonedOI->Entries.end());
  };

  assert(ClonedOI && "Expecting OutlineInfo for single region outline");

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // Extract the body of the if.
  Function *OutlinedFunc =
      CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                    ClonedFuncBFI.get(), &BPI,
                    /* AllowVarargs */ true)
          .extractCodeRegion();

  if (OutlinedFunc) {
    BasicBlock *OutliningCallBB =
        PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
            .getInstruction()
            ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
    OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
  } else
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
                                      &ToExtract.front()->front())
             << "Failed to extract region at block "
             << ore::NV("Block", ToExtract.front());
    });

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove each function that was speculatively created if there is no
    // reference.
    for (auto FuncBBPair : OutlinedFunctions) {
      Function *Func = FuncBBPair.first;
      Func->eraseFromParent();
    }
  }
}

std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {

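  // Bail out early when partial inlining cannot apply or is unlikely to pay
  // off: address-taken functions, functions the regular inliner should handle
  // (always_inline), functions marked noinline, cold entry points, and
  // functions without any callers.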
  if (F->hasAddressTaken())
    return {false, nullptr};

  // Let the inliner handle it.
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F->hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI->isFunctionEntryCold(F))
    return {false, nullptr};

  if (empty(F->users()))
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI->hasProfileSummary() && F->hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      FunctionCloner Cloner(F, OMRI.get(), ORE);

#ifndef NDEBUG
      if (TracePartialInlining) {
        dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
               << "\n";
      }
#endif
      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
#ifndef NDEBUG
        if (TracePartialInlining) {
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        }
#endif

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall through to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(F, OI.get(), ORE);
  Cloner.NormalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return {true, OutlinedFunction};

  return {false, nullptr};
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // If the call sequence(s) to the outlined function(s) are larger than the
  // sum of the original outlined region size(s), outlining does not increase
  // the chances of inlining the function (the inliner uses the size increase
  // to model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(empty(Cloner.OrigFunc->users()) &&
         "F's users should all be replaced!");

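  // Iterate over the remaining call sites of the cloned function and inline
  // into every caller deemed profitable, keeping track of how much of the
  // original entry count is consumed by the inlined call sites.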
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CS.getCaller());
    if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats.
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(
          CalleeEntryCount.setCount(CalleeEntryCountV));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

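    // Skip functions that are referenced from within their own body (i.e.
    // directly recursive); partially inlining a function into itself is not
    // supported.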
    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    std::pair<bool, Function *> Result = unswitchFunction(CurrFunc);
    if (Result.second)
      Worklist.push_back(Result.second);
    Changed |= Result.first;
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI)
          .run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}