1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass performs partial inlining, typically by inlining an if statement
11 // that surrounds the body of the function.
12 //
13 //===----------------------------------------------------------------------===//
14 
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <utility>
#include <vector>
65 
66 using namespace llvm;
67 
68 #define DEBUG_TYPE "partial-inlining"
69 
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region (cold-region) partial-inlining.
// The default is false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
               cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with the Cold
// Calling Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

#ifndef NDEBUG
// Command line option to debug partial-inlining. The default is none:
static cl::opt<bool> TracePartialInlining("trace-partial-inlining",
                                          cl::init(false), cl::Hidden,
                                          cl::desc("Trace partial inlining."));
#endif

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function.  Default is set at 10%.
// i.e. the cold region must reduce the inlining cost of the original function
// by at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. i.e. a confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%.
// i.e. if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
156 
157 namespace {
158 
// Describes a single-region outlining candidate: a chain of guarding entry
// blocks that stay inlined, a return block, and the dominating block of the
// region that gets outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
179 
180 struct FunctionOutliningMultiRegionInfo {
181   FunctionOutliningMultiRegionInfo()
182       : ORI() {}
183 
184   // Container for outline regions
185   struct OutlineRegionInfo {
186     OutlineRegionInfo(SmallVector<BasicBlock *, 8> Region,
187                       BasicBlock *EntryBlock, BasicBlock *ExitBlock,
188                       BasicBlock *ReturnBlock)
189         : Region(Region), EntryBlock(EntryBlock), ExitBlock(ExitBlock),
190           ReturnBlock(ReturnBlock) {}
191     SmallVector<BasicBlock *, 8> Region;
192     BasicBlock *EntryBlock;
193     BasicBlock *ExitBlock;
194     BasicBlock *ReturnBlock;
195   };
196 
197   SmallVector<OutlineRegionInfo, 4> ORI;
198 };
199 
200 struct PartialInlinerImpl {
201 
202   PartialInlinerImpl(
203       std::function<AssumptionCache &(Function &)> *GetAC,
204       std::function<TargetTransformInfo &(Function &)> *GTTI,
205       Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
206       ProfileSummaryInfo *ProfSI,
207       std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
208       : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI),
209         GetORE(GORE) {}
210 
211   bool run(Module &M);
212   // Main part of the transformation that calls helper functions to find
213   // outlining candidates, clone & outline the function, and attempt to
214   // partially inline the resulting function. Returns true if
215   // inlining was successful, false otherwise.  Also returns the outline
216   // function (only if we partially inlined early returns) as there is a
217   // possibility to further "peel" early return statements that were left in the
218   // outline function due to code size.
219   std::pair<bool, Function *> unswitchFunction(Function *F);
220 
221   // This class speculatively clones the the function to be partial inlined.
222   // At the end of partial inlining, the remaining callsites to the cloned
223   // function that are not partially inlined will be fixed up to reference
224   // the original function, and the cloned function will be erased.
225   struct FunctionCloner {
226     // Two constructors, one for single region outlining, the other for
227     // multi-region outlining.
228     FunctionCloner(Function *F, FunctionOutliningInfo *OI,
229                    OptimizationRemarkEmitter &ORE);
230     FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
231                    OptimizationRemarkEmitter &ORE);
232     ~FunctionCloner();
233 
234     // Prepare for function outlining: making sure there is only
235     // one incoming edge from the extracted/outlined region to
236     // the return block.
237     void NormalizeReturnBlock();
238 
239     // Do function outlining for cold regions.
240     bool doMultiRegionFunctionOutlining();
241     // Do function outlining for region after early return block(s).
242     // NOTE: For vararg functions that do the vararg handling in the outlined
243     //       function, we temporarily generate IR that does not properly
244     //       forward varargs to the outlined function. Calling InlineFunction
245     //       will update calls to the outlined functions to properly forward
246     //       the varargs.
247     Function *doSingleRegionFunctionOutlining();
248 
249     Function *OrigFunc = nullptr;
250     Function *ClonedFunc = nullptr;
251 
252     typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
253     // Keep track of Outlined Functions and the basic block they're called from.
254     SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
255 
256     // ClonedFunc is inlined in one of its callers after function
257     // outlining.
258     bool IsFunctionInlined = false;
259     // The cost of the region to be outlined.
260     int OutlinedRegionCost = 0;
261     // ClonedOI is specific to outlining non-early return blocks.
262     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
263     // ClonedOMRI is specific to outlining cold regions.
264     std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
265     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
266     OptimizationRemarkEmitter &ORE;
267   };
268 
269 private:
270   int NumPartialInlining = 0;
271   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
272   std::function<TargetTransformInfo &(Function &)> *GetTTI;
273   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
274   ProfileSummaryInfo *PSI;
275   std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
276 
277   // Return the frequency of the OutlininingBB relative to F's entry point.
278   // The result is no larger than 1 and is represented using BP.
279   // (Note that the outlined region's 'head' block can only have incoming
280   // edges from the guarding entry blocks).
281   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
282 
283   // Return true if the callee of CS should be partially inlined with
284   // profit.
285   bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
286                            BlockFrequency WeightedOutliningRcost);
287 
288   // Try to inline DuplicateFunction (cloned from F with call to
289   // the OutlinedFunction into its callers. Return true
290   // if there is any successful inlining.
291   bool tryPartialInline(FunctionCloner &Cloner);
292 
293   // Compute the mapping from use site of DuplicationFunction to the enclosing
294   // BB's profile count.
295   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
296                                      DenseMap<User *, uint64_t> &SiteCountMap);
297 
298   bool IsLimitReached() {
299     return (MaxNumPartialInlining != -1 &&
300             NumPartialInlining >= MaxNumPartialInlining);
301   }
302 
303   static CallSite getCallSite(User *U) {
304     CallSite CS;
305     if (CallInst *CI = dyn_cast<CallInst>(U))
306       CS = CallSite(CI);
307     else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
308       CS = CallSite(II);
309     else
310       llvm_unreachable("All uses must be calls");
311     return CS;
312   }
313 
314   static CallSite getOneCallSiteTo(Function *F) {
315     User *User = *F->user_begin();
316     return getCallSite(User);
317   }
318 
319   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
320     CallSite CS = getOneCallSiteTo(F);
321     DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
322     BasicBlock *Block = CS.getParent();
323     return std::make_tuple(DLoc, Block);
324   }
325 
326   // Returns the costs associated with function outlining:
327   // - The first value is the non-weighted runtime cost for making the call
328   //   to the outlined function, including the addtional  setup cost in the
329   //    outlined function itself;
330   // - The second value is the estimated size of the new call sequence in
331   //   basic block Cloner.OutliningCallBB;
332   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
333 
334   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
335   // approximate both the size and runtime cost (Note that in the current
336   // inline cost analysis, there is no clear distinction there either).
337   static int computeBBInlineCost(BasicBlock *BB);
338 
339   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
340   std::unique_ptr<FunctionOutliningMultiRegionInfo>
341   computeOutliningColdRegionsInfo(Function *F);
342 };
343 
344 struct PartialInlinerLegacyPass : public ModulePass {
345   static char ID; // Pass identification, replacement for typeid
346 
347   PartialInlinerLegacyPass() : ModulePass(ID) {
348     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
349   }
350 
351   void getAnalysisUsage(AnalysisUsage &AU) const override {
352     AU.addRequired<AssumptionCacheTracker>();
353     AU.addRequired<ProfileSummaryInfoWrapperPass>();
354     AU.addRequired<TargetTransformInfoWrapperPass>();
355   }
356 
357   bool runOnModule(Module &M) override {
358     if (skipModule(M))
359       return false;
360 
361     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
362     TargetTransformInfoWrapperPass *TTIWP =
363         &getAnalysis<TargetTransformInfoWrapperPass>();
364     ProfileSummaryInfo *PSI =
365         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
366     std::unique_ptr<OptimizationRemarkEmitter> UPORE;
367 
368     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
369         [&ACT](Function &F) -> AssumptionCache & {
370       return ACT->getAssumptionCache(F);
371     };
372 
373     std::function<TargetTransformInfo &(Function &)> GetTTI =
374         [&TTIWP](Function &F) -> TargetTransformInfo & {
375       return TTIWP->getTTI(F);
376     };
377 
378     std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
379         [&UPORE](Function &F) -> OptimizationRemarkEmitter & {
380       UPORE.reset(new OptimizationRemarkEmitter(&F));
381       return *UPORE.get();
382     };
383 
384     return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, NoneType::None, PSI,
385                               &GetORE)
386         .run(M);
387   }
388 };
389 
390 } // end anonymous namespace
391 
392 std::unique_ptr<FunctionOutliningMultiRegionInfo>
393 PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F) {
394   BasicBlock *EntryBlock = &F->front();
395 
396   DominatorTree DT(*F);
397   DominanceFrontier DF;
398   DF.analyze(DT);
399   LoopInfo LI(DT);
400   BranchProbabilityInfo BPI(*F, LI);
401   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
402   BlockFrequencyInfo *BFI;
403   if (!GetBFI) {
404     ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
405     BFI = ScopedBFI.get();
406   } else
407     BFI = &(*GetBFI)(*F);
408 
409   auto &ORE = (*GetORE)(*F);
410 
411   // Return if we don't have profiling information.
412   if (!PSI->hasInstrumentationProfile())
413     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
414 
415   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
416       llvm::make_unique<FunctionOutliningMultiRegionInfo>();
417 
418   auto IsSingleEntry = [](SmallVectorImpl<BasicBlock *> &BlockList) {
419     BasicBlock *Dom = BlockList.front();
420     return BlockList.size() > 1 &&
421            std::distance(pred_begin(Dom), pred_end(Dom)) == 1;
422   };
423 
424   auto IsSingleExit =
425       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
426     BasicBlock *ExitBlock = nullptr;
427     for (auto *Block : BlockList) {
428       for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
429         if (!is_contained(BlockList, *SI)) {
430           if (ExitBlock) {
431             ORE.emit([&]() {
432               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
433                                               &SI->front())
434                      << "Region dominated by "
435                      << ore::NV("Block", BlockList.front()->getName())
436                      << " has more than one region exit edge.";
437             });
438             return nullptr;
439           } else
440             ExitBlock = Block;
441         }
442       }
443     }
444     return ExitBlock;
445   };
446 
447   auto BBProfileCount = [BFI](BasicBlock *BB) {
448     return BFI->getBlockProfileCount(BB)
449                ? BFI->getBlockProfileCount(BB).getValue()
450                : 0;
451   };
452 
453   // Use the same computeBBInlineCost function to compute the cost savings of
454   // the outlining the candidate region.
455   int OverallFunctionCost = 0;
456   for (auto &BB : *F)
457     OverallFunctionCost += computeBBInlineCost(&BB);
458 
459 #ifndef NDEBUG
460   if (TracePartialInlining)
461     dbgs() << "OverallFunctionCost = " << OverallFunctionCost << "\n";
462 #endif
463   int MinOutlineRegionCost =
464       static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
465   BranchProbability MinBranchProbability(
466       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
467       MinBlockCounterExecution);
468   bool ColdCandidateFound = false;
469   BasicBlock *CurrEntry = EntryBlock;
470   std::vector<BasicBlock *> DFS;
471   DenseMap<BasicBlock *, bool> VisitedMap;
472   DFS.push_back(CurrEntry);
473   VisitedMap[CurrEntry] = true;
474   // Use Depth First Search on the basic blocks to find CFG edges that are
475   // considered cold.
476   // Cold regions considered must also have its inline cost compared to the
477   // overall inline cost of the original function.  The region is outlined only
478   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
479   // more.
480   while (!DFS.empty()) {
481     auto *thisBB = DFS.back();
482     DFS.pop_back();
483     // Only consider regions with predecessor blocks that are considered
484     // not-cold (default: part of the top 99.99% of all block counters)
485     // AND greater than our minimum block execution count (default: 100).
486     if (PSI->isColdBB(thisBB, BFI) ||
487         BBProfileCount(thisBB) < MinBlockCounterExecution)
488       continue;
489     for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
490       if (VisitedMap[*SI])
491         continue;
492       VisitedMap[*SI] = true;
493       DFS.push_back(*SI);
494       // If branch isn't cold, we skip to the next one.
495       BranchProbability SuccProb = BPI.getEdgeProbability(thisBB, *SI);
496       if (SuccProb > MinBranchProbability)
497         continue;
498 #ifndef NDEBUG
499       if (TracePartialInlining) {
500         dbgs() << "Found cold edge: " << thisBB->getName() << "->"
501                << (*SI)->getName() << "\nBranch Probability = " << SuccProb
502                << "\n";
503       }
504 #endif
505       SmallVector<BasicBlock *, 8> DominateVector;
506       DT.getDescendants(*SI, DominateVector);
507       // We can only outline single entry regions (for now).
508       if (!IsSingleEntry(DominateVector))
509         continue;
510       BasicBlock *ExitBlock = nullptr;
511       // We can only outline single exit regions (for now).
512       if (!(ExitBlock = IsSingleExit(DominateVector)))
513         continue;
514       int OutlineRegionCost = 0;
515       for (auto *BB : DominateVector)
516         OutlineRegionCost += computeBBInlineCost(BB);
517 
518 #ifndef NDEBUG
519       if (TracePartialInlining)
520         dbgs() << "OutlineRegionCost = " << OutlineRegionCost << "\n";
521 #endif
522 
523       if (OutlineRegionCost < MinOutlineRegionCost) {
524         ORE.emit([&]() {
525           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
526                                             &SI->front())
527                  << ore::NV("Callee", F) << " inline cost-savings smaller than "
528                  << ore::NV("Cost", MinOutlineRegionCost);
529         });
530         continue;
531       }
532       // For now, ignore blocks that belong to a SISE region that is a
533       // candidate for outlining.  In the future, we may want to look
534       // at inner regions because the outer region may have live-exit
535       // variables.
536       for (auto *BB : DominateVector)
537         VisitedMap[BB] = true;
538       // ReturnBlock here means the block after the outline call
539       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
540       // assert(ReturnBlock && "ReturnBlock is NULL somehow!");
541       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
542           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
543       RegInfo.Region = DominateVector;
544       OutliningInfo->ORI.push_back(RegInfo);
545 #ifndef NDEBUG
546       if (TracePartialInlining) {
547         dbgs() << "Found Cold Candidate starting at block: "
548                << DominateVector.front()->getName() << "\n";
549       }
550 #endif
551       ColdCandidateFound = true;
552       NumColdRegionsFound++;
553     }
554   }
555   if (ColdCandidateFound)
556     return OutliningInfo;
557   else
558     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
559 }
560 
// Detects the single-region outlining pattern: a chain of two-successor
// "entry" blocks guarding an early-return, where the non-return side
// dominates the region to be outlined. Returns null if F does not match.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  // The entry must end in a conditional branch for the pattern to apply.
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair as (return block, other); both null when neither
  // successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape: one successor jumps directly to the other.
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  // Walk the chain of guarding entry blocks until a return block is found
  // or the pattern breaks.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  // NOTE(review): Entries is captured by value, so insertions into the
  // outer set below (see the growth loop's Entries.insert(Cand)) are not
  // visible to this lambda — confirm this is intended, as it limits how
  // far the inline region can grow.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the pattern and records which entries branch to ReturnBlock.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    // Only peel when Cand branches to the same return block and fully
    // dominates the remaining region.
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
713 
714 // Check if there is PGO data or user annoated branch data:
715 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
716   if (F->getEntryCount())
717     return true;
718   // Now check if any of the entry block has MD_prof data:
719   for (auto *E : OI->Entries) {
720     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
721     if (!BR || BR->isUnconditional())
722       continue;
723     uint64_t T, F;
724     if (BR->extractProfMetadata(T, F))
725       return true;
726   }
727   return false;
728 }
729 
730 BranchProbability
731 PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
732   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
733   auto EntryFreq =
734       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
735   auto OutliningCallFreq =
736       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
737   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
738   // we outlined any regions, so we may encounter situations where the
739   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
740   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency()) {
741     OutliningCallFreq = EntryFreq;
742   }
743   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
744       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
745 
746   if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
747     return OutlineRegionRelFreq;
748 
749   // When profile data is not available, we need to be conservative in
750   // estimating the overall savings. Static branch prediction can usually
751   // guess the branch direction right (taken/non-taken), but the guessed
752   // branch probability is usually not biased enough. In case when the
753   // outlined region is predicted to be likely, its probability needs
754   // to be made higher (more biased) to not under-estimate the cost of
755   // function outlining. On the other hand, if the outlined region
756   // is predicted to be less likely, the predicted probablity is usually
757   // higher than the actual. For instance, the actual probability of the
758   // less likely target is only 5%, but the guessed probablity can be
759   // 40%. In the latter case, there is no need for further adjustement.
760   // FIXME: add an option for this.
761   if (OutlineRegionRelFreq < BranchProbability(45, 100))
762     return OutlineRegionRelFreq;
763 
764   OutlineRegionRelFreq = std::max(
765       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
766 
767   return OutlineRegionRelFreq;
768 }
769 
// Decide whether it is profitable to partially inline the cloned function at
// call site CS. WeightedOutliningRcost is the runtime overhead of calling the
// outlined function(s), already scaled by the relative frequency of the
// outlining call block. Emits an optimization remark for every decision.
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner,
    BlockFrequency WeightedOutliningRcost) {
  using namespace ore;

  // Testing/override knob: accept every candidate without cost analysis.
  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  auto &ORE = (*GetORE)(*Caller);
  // Reuse the regular inliner's cost model on the cloned callee.
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  // "Always inline" callees should be fully inlined by the regular inliner;
  // partial inlining would only get in the way.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  // Respect a hard "never inline" verdict from the cost model.
  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  // Cost exceeds threshold: too expensive to inline even the residual part.
  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  // All checks passed: partial inlining is expected to be profitable here.
  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
849 
// TODO: Ideally we should share Inliner's InlineCost Analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
853 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
854   int InlineCost = 0;
855   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
856   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
857     if (isa<DbgInfoIntrinsic>(I))
858       continue;
859 
860     switch (I->getOpcode()) {
861     case Instruction::BitCast:
862     case Instruction::PtrToInt:
863     case Instruction::IntToPtr:
864     case Instruction::Alloca:
865       continue;
866     case Instruction::GetElementPtr:
867       if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
868         continue;
869     default:
870       break;
871     }
872 
873     IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
874     if (IntrInst) {
875       if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
876           IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
877         continue;
878     }
879 
880     if (CallInst *CI = dyn_cast<CallInst>(I)) {
881       InlineCost += getCallsiteCost(CallSite(CI), DL);
882       continue;
883     }
884 
885     if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
886       InlineCost += getCallsiteCost(CallSite(II), DL);
887       continue;
888     }
889 
890     if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
891       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
892       continue;
893     }
894     InlineCost += InlineConstants::InstrCost;
895   }
896   return InlineCost;
897 }
898 
899 std::tuple<int, int>
900 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
901   int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
902   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
903     Function *OutlinedFunc = FuncBBPair.first;
904     BasicBlock* OutliningCallBB = FuncBBPair.second;
905     // Now compute the cost of the call sequence to the outlined function
906     // 'OutlinedFunction' in BB 'OutliningCallBB':
907     OutliningFuncCallCost += computeBBInlineCost(OutliningCallBB);
908 
909     // Now compute the cost of the extracted/outlined function itself:
910     for (BasicBlock &BB : *OutlinedFunc)
911       OutlinedFunctionCost += computeBBInlineCost(&BB);
912   }
913   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
914          "Outlined function cost should be no less than the outlined region");
915 
916   // The code extractor introduces a new root and exit stub blocks with
917   // additional unconditional branches. Those branches will be eliminated
918   // later with bb layout. The cost should be adjusted accordingly:
919   OutlinedFunctionCost -=
920       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
921 
922   int OutliningRuntimeOverhead =
923       OutliningFuncCallCost +
924       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
925       ExtraOutliningPenalty;
926 
927   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
928 }
929 
930 // Create the callsite to profile count map which is
931 // used to update the original function's entry count,
932 // after the function is partially inlined into the callsite.
933 void PartialInlinerImpl::computeCallsiteToProfCountMap(
934     Function *DuplicateFunction,
935     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
936   std::vector<User *> Users(DuplicateFunction->user_begin(),
937                             DuplicateFunction->user_end());
938   Function *CurrentCaller = nullptr;
939   std::unique_ptr<BlockFrequencyInfo> TempBFI;
940   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
941 
942   auto ComputeCurrBFI = [&,this](Function *Caller) {
943       // For the old pass manager:
944       if (!GetBFI) {
945         DominatorTree DT(*Caller);
946         LoopInfo LI(DT);
947         BranchProbabilityInfo BPI(*Caller, LI);
948         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
949         CurrentCallerBFI = TempBFI.get();
950       } else {
951         // New pass manager:
952         CurrentCallerBFI = &(*GetBFI)(*Caller);
953       }
954   };
955 
956   for (User *User : Users) {
957     CallSite CS = getCallSite(User);
958     Function *Caller = CS.getCaller();
959     if (CurrentCaller != Caller) {
960       CurrentCaller = Caller;
961       ComputeCurrBFI(Caller);
962     } else {
963       assert(CurrentCallerBFI && "CallerBFI is not set");
964     }
965     BasicBlock *CallBB = CS.getInstruction()->getParent();
966     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
967     if (Count)
968       CallSiteToProfCountMap[User] = *Count;
969     else
970       CallSiteToProfCountMap[User] = 0;
971   }
972 }
973 
974 PartialInlinerImpl::FunctionCloner::FunctionCloner(
975     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE)
976     : OrigFunc(F), ORE(ORE) {
977   ClonedOI = llvm::make_unique<FunctionOutliningInfo>();
978 
979   // Clone the function, so that we can hack away on it.
980   ValueToValueMapTy VMap;
981   ClonedFunc = CloneFunction(F, VMap);
982 
983   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
984   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
985   for (BasicBlock *BB : OI->Entries) {
986     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
987   }
988   for (BasicBlock *E : OI->ReturnBlockPreds) {
989     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
990     ClonedOI->ReturnBlockPreds.push_back(NewE);
991   }
992   // Go ahead and update all uses to the duplicate, so that we can just
993   // use the inliner functionality when we're done hacking.
994   F->replaceAllUsesWith(ClonedFunc);
995 }
996 
997 PartialInlinerImpl::FunctionCloner::FunctionCloner(
998     Function *F, FunctionOutliningMultiRegionInfo *OI,
999     OptimizationRemarkEmitter &ORE)
1000     : OrigFunc(F), ORE(ORE) {
1001   ClonedOMRI = llvm::make_unique<FunctionOutliningMultiRegionInfo>();
1002 
1003   // Clone the function, so that we can hack away on it.
1004   ValueToValueMapTy VMap;
1005   ClonedFunc = CloneFunction(F, VMap);
1006 
1007   // Go through all Outline Candidate Regions and update all BasicBlock
1008   // information.
1009   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1010        OI->ORI) {
1011     SmallVector<BasicBlock *, 8> Region;
1012     for (BasicBlock *BB : RegionInfo.Region) {
1013       Region.push_back(cast<BasicBlock>(VMap[BB]));
1014     }
1015     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1016     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1017     BasicBlock *NewReturnBlock = nullptr;
1018     if (RegionInfo.ReturnBlock)
1019       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1020     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1021         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1022     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1023   }
1024   // Go ahead and update all uses to the duplicate, so that we can just
1025   // use the inliner functionality when we're done hacking.
1026   F->replaceAllUsesWith(ClonedFunc);
1027 }
1028 
// Split the return block's PHI nodes into a two-level sequence so that the
// incoming values from the extracted (outlined) region and those from the
// entry region live in separate blocks. This keeps region-internal values
// from becoming live-out of the extracted code.
void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Returns the leading PHI of BB, or nullptr if BB does not start with one.
  // Note: the loop body breaks on its first iteration in every case, so only
  // the first instruction is ever inspected.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Shouldn't need to normalize PHIs if we're not outlining non-early return
  // blocks.
  if (!ClonedOI)
    return;

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // only split block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  // No PHIs, or every incoming value already comes from an entry-region
  // predecessor (plus at most one other): nothing to split.
  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  // Returns the single common incoming value if PN is trivial (all incoming
  // values identical), otherwise nullptr.
  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  // Split after the PHIs: PreReturn keeps the old PHIs (region side), the
  // new ReturnBlock receives fresh second-level PHIs (outside the region).
  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    // New second-level PHI: one slot per entry-region predecessor plus one
    // for the value flowing out of PreReturn itself.
    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    // Move the entry-region incoming edges from the old PHI to the new one.
    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce definition inside the outline
    // region which is live-out, causing necessary overhead (load, store
    // arg passing etc).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  // Erase after the loop to avoid invalidating the iterator above.
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  // Redirect entry-region predecessors to branch to the new return block.
  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}
1108 
1109 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1110 
1111   auto ComputeRegionCost = [](SmallVectorImpl<BasicBlock *> &Region) {
1112     int Cost = 0;
1113     for (BasicBlock* BB : Region)
1114       Cost += computeBBInlineCost(BB);
1115     return Cost;
1116   };
1117 
1118   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1119 
1120   if (ClonedOMRI->ORI.empty())
1121     return false;
1122 
1123   // The CodeExtractor needs a dominator tree.
1124   DominatorTree DT;
1125   DT.recalculate(*ClonedFunc);
1126 
1127   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1128   LoopInfo LI(DT);
1129   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1130   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1131 
1132   SetVector<Value *> Inputs, Outputs, Sinks;
1133   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1134        ClonedOMRI->ORI) {
1135     int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
1136 
1137     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1138                      ClonedFuncBFI.get(), &BPI, /* AllowVarargs */ false);
1139 
1140     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1141 
1142 #ifndef NDEBUG
1143     if (TracePartialInlining) {
1144       dbgs() << "inputs: " << Inputs.size() << "\n";
1145       dbgs() << "outputs: " << Outputs.size() << "\n";
1146       for (Value *value : Inputs)
1147         dbgs() << "value used in func: " << *value << "\n";
1148       for (Value *output : Outputs)
1149         dbgs() << "instr used in func: " << *output << "\n";
1150     }
1151 #endif
1152     // Do not extract regions that have live exit variables.
1153     if (Outputs.size() > 0 && !ForceLiveExit)
1154       continue;
1155 
1156     Function *OutlinedFunc = CE.extractCodeRegion();
1157 
1158     if (OutlinedFunc) {
1159       CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
1160       BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
1161       assert(OutliningCallBB->getParent() == ClonedFunc);
1162       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1163       NumColdRegionsOutlined++;
1164       OutlinedRegionCost += CurrentOutlinedRegionCost;
1165 
1166       if (MarkOutlinedColdCC) {
1167         OutlinedFunc->setCallingConv(CallingConv::Cold);
1168         OCS.setCallingConv(CallingConv::Cold);
1169       }
1170     } else
1171       ORE.emit([&]() {
1172         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1173                                         &RegionInfo.Region.front()->front())
1174                << "Failed to extract region at block "
1175                << ore::NV("Block", RegionInfo.Region.front());
1176       });
1177   }
1178 
1179   return !OutlinedFunctions.empty();
1180 }
1181 
1182 Function *
1183 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1184   // Returns true if the block is to be partial inlined into the caller
1185   // (i.e. not to be extracted to the out of line function)
1186   auto ToBeInlined = [&, this](BasicBlock *BB) {
1187     return BB == ClonedOI->ReturnBlock ||
1188            (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
1189             ClonedOI->Entries.end());
1190   };
1191 
1192   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1193   // The CodeExtractor needs a dominator tree.
1194   DominatorTree DT;
1195   DT.recalculate(*ClonedFunc);
1196 
1197   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1198   LoopInfo LI(DT);
1199   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1200   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1201 
1202   // Gather up the blocks that we're going to extract.
1203   std::vector<BasicBlock *> ToExtract;
1204   ToExtract.push_back(ClonedOI->NonReturnBlock);
1205   OutlinedRegionCost +=
1206       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
1207   for (BasicBlock &BB : *ClonedFunc)
1208     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1209       ToExtract.push_back(&BB);
1210       // FIXME: the code extractor may hoist/sink more code
1211       // into the outlined function which may make the outlining
1212       // overhead (the difference of the outlined function cost
1213       // and OutliningRegionCost) look larger.
1214       OutlinedRegionCost += computeBBInlineCost(&BB);
1215     }
1216 
1217   // Extract the body of the if.
1218   Function *OutlinedFunc =
1219       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1220                     ClonedFuncBFI.get(), &BPI,
1221                     /* AllowVarargs */ true)
1222           .extractCodeRegion();
1223 
1224   if (OutlinedFunc) {
1225     BasicBlock *OutliningCallBB =
1226         PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
1227             .getInstruction()
1228             ->getParent();
1229     assert(OutliningCallBB->getParent() == ClonedFunc);
1230     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1231   } else
1232     ORE.emit([&]() {
1233       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1234                                       &ToExtract.front()->front())
1235              << "Failed to extract region at block "
1236              << ore::NV("Block", ToExtract.front());
1237     });
1238 
1239   return OutlinedFunc;
1240 }
1241 
1242 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1243   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1244   // users (function pointers, etc.) back to the original function.
1245   ClonedFunc->replaceAllUsesWith(OrigFunc);
1246   ClonedFunc->eraseFromParent();
1247   if (!IsFunctionInlined) {
1248     // Remove each function that was speculatively created if there is no
1249     // reference.
1250     for (auto FuncBBPair : OutlinedFunctions) {
1251       Function *Func = FuncBBPair.first;
1252       Func->eraseFromParent();
1253     }
1254   }
1255 }
1256 
1257 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
1258 
1259   if (F->hasAddressTaken())
1260     return {false, nullptr};
1261 
1262   // Let inliner handle it
1263   if (F->hasFnAttribute(Attribute::AlwaysInline))
1264     return {false, nullptr};
1265 
1266   if (F->hasFnAttribute(Attribute::NoInline))
1267     return {false, nullptr};
1268 
1269   if (PSI->isFunctionEntryCold(F))
1270     return {false, nullptr};
1271 
1272   if (F->user_begin() == F->user_end())
1273     return {false, nullptr};
1274 
1275   auto &ORE = (*GetORE)(*F);
1276 
1277   // Only try to outline cold regions if we have a profile summary, which
1278   // implies we have profiling information.
1279   if (PSI->hasProfileSummary() && F->getEntryCount().hasValue() &&
1280       !DisableMultiRegionPartialInline) {
1281     std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1282         computeOutliningColdRegionsInfo(F);
1283     if (OMRI) {
1284       FunctionCloner Cloner(F, OMRI.get(), ORE);
1285 
1286 #ifndef NDEBUG
1287       if (TracePartialInlining) {
1288         dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
1289         dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
1290                << "\n";
1291       }
1292 #endif
1293       bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1294 
1295       if (DidOutline) {
1296 #ifndef NDEBUG
1297         if (TracePartialInlining) {
1298           dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1299           Cloner.ClonedFunc->print(dbgs());
1300           dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1301         }
1302 #endif
1303 
1304         if (tryPartialInline(Cloner))
1305           return {true, nullptr};
1306       }
1307     }
1308   }
1309 
1310   // Fall-thru to regular partial inlining if we:
1311   //    i) can't find any cold regions to outline, or
1312   //   ii) can't inline the outlined function anywhere.
1313   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1314   if (!OI)
1315     return {false, nullptr};
1316 
1317   FunctionCloner Cloner(F, OI.get(), ORE);
1318   Cloner.NormalizeReturnBlock();
1319 
1320   Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1321 
1322   if (!OutlinedFunction)
1323     return {false, nullptr};
1324 
1325   bool AnyInline = tryPartialInline(Cloner);
1326 
1327   if (AnyInline)
1328     return {true, OutlinedFunction};
1329 
1330   return {false, nullptr};
1331 }
1332 
// Attempt to inline the cloned (region-outlined) function into each of its
// call sites. Returns true if at least one call site was inlined; on success
// also updates the original function's profile entry count.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;
  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI) {
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  } else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  // Scale the raw runtime overhead by how often the outlining call executes.
  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    auto &ORE = (*GetORE)(*Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The FunctionCloner constructor redirected all of OrigFunc's uses to the
  // clone; nothing should reference the original at this point.
  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  // Snapshot the user list: successful inlining removes call sites from the
  // use list while we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  // Per-callsite profile counts, used below to decay the entry count of the
  // original function as call sites get inlined.
  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    // Honor the -max-partial-inlining style cap on transformations.
    if (IsLimitReached())
      continue;


    if (!shouldPartialInline(CS, Cloner, WeightedRcost))
      continue;

    auto &ORE = (*GetORE)(*CS.getCaller());
    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(CS, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr)))
      continue;

    ORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;

  }

  if (AnyInline) {
    // Mark success so the FunctionCloner destructor keeps the outlined
    // functions, and write back the decayed entry count.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
    auto &ORE = (*GetORE)(*Cloner.OrigFunc);
    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });

  }

  return AnyInline;
}
1449 
1450 bool PartialInlinerImpl::run(Module &M) {
1451   if (DisablePartialInlining)
1452     return false;
1453 
1454   std::vector<Function *> Worklist;
1455   Worklist.reserve(M.size());
1456   for (Function &F : M)
1457     if (!F.use_empty() && !F.isDeclaration())
1458       Worklist.push_back(&F);
1459 
1460   bool Changed = false;
1461   while (!Worklist.empty()) {
1462     Function *CurrFunc = Worklist.back();
1463     Worklist.pop_back();
1464 
1465     if (CurrFunc->use_empty())
1466       continue;
1467 
1468     bool Recursive = false;
1469     for (User *U : CurrFunc->users())
1470       if (Instruction *I = dyn_cast<Instruction>(U))
1471         if (I->getParent()->getParent() == CurrFunc) {
1472           Recursive = true;
1473           break;
1474         }
1475     if (Recursive)
1476       continue;
1477 
1478     std::pair<bool, Function * > Result = unswitchFunction(CurrFunc);
1479     if (Result.second)
1480       Worklist.push_back(Result.second);
1481     if (Result.first) {
1482       Changed = true;
1483     }
1484   }
1485 
1486   return Changed;
1487 }
1488 
// Pass identification token for the legacy pass manager.
char PartialInlinerLegacyPass::ID = 0;

// Register the legacy pass and the analyses it depends on.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

// Factory used by the legacy pass manager to instantiate the pass.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1502 
1503 PreservedAnalyses PartialInlinerPass::run(Module &M,
1504                                           ModuleAnalysisManager &AM) {
1505   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1506 
1507   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
1508       [&FAM](Function &F) -> AssumptionCache & {
1509     return FAM.getResult<AssumptionAnalysis>(F);
1510   };
1511 
1512   std::function<BlockFrequencyInfo &(Function &)> GetBFI =
1513       [&FAM](Function &F) -> BlockFrequencyInfo & {
1514     return FAM.getResult<BlockFrequencyAnalysis>(F);
1515   };
1516 
1517   std::function<TargetTransformInfo &(Function &)> GetTTI =
1518       [&FAM](Function &F) -> TargetTransformInfo & {
1519     return FAM.getResult<TargetIRAnalysis>(F);
1520   };
1521 
1522   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
1523       [&FAM](Function &F) -> OptimizationRemarkEmitter & {
1524     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
1525   };
1526 
1527   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
1528 
1529   if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI, &GetORE)
1530           .run(M))
1531     return PreservedAnalyses::none();
1532   return PreservedAnalyses::all();
1533 }
1534