1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass performs partial inlining, typically by inlining an if statement
11 // that surrounds the body of the function.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Transforms/IPO/PartialInlining.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/BlockFrequencyInfo.h"
18 #include "llvm/Analysis/BranchProbabilityInfo.h"
19 #include "llvm/Analysis/CodeMetrics.h"
20 #include "llvm/Analysis/InlineCost.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
23 #include "llvm/Analysis/ProfileSummaryInfo.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/Pass.h"
33 #include "llvm/Transforms/IPO.h"
34 #include "llvm/Transforms/Utils/Cloning.h"
35 #include "llvm/Transforms/Utils/CodeExtractor.h"
36 using namespace llvm;
37 
38 #define DEBUG_TYPE "partial-inlining"
39 
40 STATISTIC(NumPartialInlined,
41           "Number of callsites functions partially inlined into.");
42 
43 // Command line option to disable partial-inlining. The default is false:
44 static cl::opt<bool>
45     DisablePartialInlining("disable-partial-inlining", cl::init(false),
46                            cl::Hidden, cl::desc("Disable partial ininling"));
47 // This is an option used by testing:
48 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
49                                       cl::init(false), cl::ZeroOrMore,
50                                       cl::ReallyHidden,
51                                       cl::desc("Skip Cost Analysis"));
52 
53 static cl::opt<unsigned> MaxNumInlineBlocks(
54     "max-num-inline-blocks", cl::init(5), cl::Hidden,
55     cl::desc("Max number of blocks to be partially inlined"));
56 
57 // Command line option to set the maximum number of partial inlining allowed
58 // for the module. The default value of -1 means no limit.
59 static cl::opt<int> MaxNumPartialInlining(
60     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
61     cl::desc("Max number of partial inlining. The default is unlimited"));
62 
63 // Used only when PGO or user annotated branch data is absent. It is
64 // the least value that is used to weigh the outline region. If BFI
65 // produces larger value, the BFI value will be used.
66 static cl::opt<int>
67     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
68                              cl::Hidden, cl::ZeroOrMore,
69                              cl::desc("Relative frequency of outline region to "
70                                       "the entry block"));
71 
72 static cl::opt<unsigned> ExtraOutliningPenalty(
73     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
74     cl::desc("A debug option to add additional penalty to the computed one."));
75 
76 namespace {
77 
78 struct FunctionOutliningInfo {
79   FunctionOutliningInfo()
80       : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
81         ReturnBlockPreds() {}
82   // Returns the number of blocks to be inlined including all blocks
83   // in Entries and one return block.
84   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
85 
86   // A set of blocks including the function entry that guard
87   // the region to be outlined.
88   SmallVector<BasicBlock *, 4> Entries;
89   // The return block that is not included in the outlined region.
90   BasicBlock *ReturnBlock;
91   // The dominating block of the region to be outlined.
92   BasicBlock *NonReturnBlock;
93   // The set of blocks in Entries that that are predecessors to ReturnBlock
94   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
95 };
96 
97 struct PartialInlinerImpl {
98   PartialInlinerImpl(
99       std::function<AssumptionCache &(Function &)> *GetAC,
100       std::function<TargetTransformInfo &(Function &)> *GTTI,
101       Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
102       ProfileSummaryInfo *ProfSI)
103       : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
104   bool run(Module &M);
105   Function *unswitchFunction(Function *F);
106 
107   // This class speculatively clones the the function to be partial inlined.
108   // At the end of partial inlining, the remaining callsites to the cloned
109   // function that are not partially inlined will be fixed up to reference
110   // the original function, and the cloned function will be erased.
111   struct FunctionCloner {
112     FunctionCloner(Function *F, FunctionOutliningInfo *OI);
113     ~FunctionCloner();
114 
115     // Prepare for function outlining: making sure there is only
116     // one incoming edge from the extracted/outlined region to
117     // the return block.
118     void NormalizeReturnBlock();
119 
120     // Do function outlining:
121     Function *doFunctionOutlining();
122 
123     Function *OrigFunc = nullptr;
124     Function *ClonedFunc = nullptr;
125     Function *OutlinedFunc = nullptr;
126     BasicBlock *OutliningCallBB = nullptr;
127     // ClonedFunc is inlined in one of its callers after function
128     // outlining.
129     bool IsFunctionInlined = false;
130     // The cost of the region to be outlined.
131     int OutlinedRegionCost = 0;
132     std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
133     std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
134   };
135 
136 private:
137   int NumPartialInlining = 0;
138   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
139   std::function<TargetTransformInfo &(Function &)> *GetTTI;
140   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
141   ProfileSummaryInfo *PSI;
142 
143   // Return the frequency of the OutlininingBB relative to F's entry point.
144   // The result is no larger than 1 and is represented using BP.
145   // (Note that the outlined region's 'head' block can only have incoming
146   // edges from the guarding entry blocks).
147   BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
148 
149   // Return true if the callee of CS should be partially inlined with
150   // profit.
151   bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
152                            BlockFrequency WeightedOutliningRcost,
153                            OptimizationRemarkEmitter &ORE);
154 
155   // Try to inline DuplicateFunction (cloned from F with call to
156   // the OutlinedFunction into its callers. Return true
157   // if there is any successful inlining.
158   bool tryPartialInline(FunctionCloner &Cloner);
159 
160   // Compute the mapping from use site of DuplicationFunction to the enclosing
161   // BB's profile count.
162   void computeCallsiteToProfCountMap(Function *DuplicateFunction,
163                                      DenseMap<User *, uint64_t> &SiteCountMap);
164 
165   bool IsLimitReached() {
166     return (MaxNumPartialInlining != -1 &&
167             NumPartialInlining >= MaxNumPartialInlining);
168   }
169 
170   static CallSite getCallSite(User *U) {
171     CallSite CS;
172     if (CallInst *CI = dyn_cast<CallInst>(U))
173       CS = CallSite(CI);
174     else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
175       CS = CallSite(II);
176     else
177       llvm_unreachable("All uses must be calls");
178     return CS;
179   }
180 
181   static CallSite getOneCallSiteTo(Function *F) {
182     User *User = *F->user_begin();
183     return getCallSite(User);
184   }
185 
186   std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
187     CallSite CS = getOneCallSiteTo(F);
188     DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
189     BasicBlock *Block = CS.getParent();
190     return std::make_tuple(DLoc, Block);
191   }
192 
193   // Returns the costs associated with function outlining:
194   // - The first value is the non-weighted runtime cost for making the call
195   //   to the outlined function, including the addtional  setup cost in the
196   //    outlined function itself;
197   // - The second value is the estimated size of the new call sequence in
198   //   basic block Cloner.OutliningCallBB;
199   std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
200   // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
201   // approximate both the size and runtime cost (Note that in the current
202   // inline cost analysis, there is no clear distinction there either).
203   static int computeBBInlineCost(BasicBlock *BB);
204 
205   std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
206 
207 };
208 
209 struct PartialInlinerLegacyPass : public ModulePass {
210   static char ID; // Pass identification, replacement for typeid
211   PartialInlinerLegacyPass() : ModulePass(ID) {
212     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
213   }
214 
215   void getAnalysisUsage(AnalysisUsage &AU) const override {
216     AU.addRequired<AssumptionCacheTracker>();
217     AU.addRequired<ProfileSummaryInfoWrapperPass>();
218     AU.addRequired<TargetTransformInfoWrapperPass>();
219   }
220   bool runOnModule(Module &M) override {
221     if (skipModule(M))
222       return false;
223 
224     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
225     TargetTransformInfoWrapperPass *TTIWP =
226         &getAnalysis<TargetTransformInfoWrapperPass>();
227     ProfileSummaryInfo *PSI =
228         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
229 
230     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
231         [&ACT](Function &F) -> AssumptionCache & {
232       return ACT->getAssumptionCache(F);
233     };
234 
235     std::function<TargetTransformInfo &(Function &)> GetTTI =
236         [&TTIWP](Function &F) -> TargetTransformInfo & {
237       return TTIWP->getTTI(F);
238     };
239 
240     return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
241   }
242 };
243 }
244 
// Analyze F's CFG and return an outlining candidate: a chain of "entry"
// blocks (starting at the function entry), each ending in a conditional
// branch, which eventually guards a return block on one side and the
// region to be outlined on the other. Returns null if no suitable shape
// is found.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  // The entry block must end with a conditional branch.
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  // Returns the number of successors of BB.
  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  // Returns true if BB ends with a return instruction.
  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the successor pair as (return block, other block), or
  // (nullptr, nullptr) if neither successor returns.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape: returns (common successor, other successor)
  // if one successor branches to the other, else (nullptr, nullptr).
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  // Walk down the chain of guarding entry blocks, looking for the block
  // that branches to a return block.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    // Found it: one successor returns; the other dominates the region
    // to be outlined.
    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    // Keep walking down the chain of guarding blocks.
    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;

  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the candidate shape and records which entry blocks branch
  // directly to the return block (needed later for PHI splitting).
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    // The peeled block must branch to the same return block.
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    // The remaining outlined region must stay single-entry.
    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
398 
399 // Check if there is PGO data or user annoated branch data:
400 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
401   if (F->getEntryCount())
402     return true;
403   // Now check if any of the entry block has MD_prof data:
404   for (auto *E : OI->Entries) {
405     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
406     if (!BR || BR->isUnconditional())
407       continue;
408     uint64_t T, F;
409     if (BR->extractProfMetadata(T, F))
410       return true;
411   }
412   return false;
413 }
414 
// Compute the frequency of Cloner's OutliningCallBB relative to the
// cloned function's entry block, adjusting the statically-estimated
// value upward when no real profile data exists.
BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {

  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  // Real profile data (PGO or user-annotated branch weights) is trusted
  // as-is.
  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  // Bump the estimate up to at least OutlineRegionFreqPercent (75% by
  // default).
  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}
450 
// Decide whether partially inlining the cloned callee into the callsite
// CS is profitable: the (weighted) savings of removing the call must be
// at least the weighted runtime cost of the outlined call. Emits
// optimization remarks explaining each decision.
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {

  using namespace ore;
  // Testing knob: treat every candidate as profitable.
  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  // Always-inline candidates should be fully inlined by the regular
  // inliner instead.
  if (IC.isAlways()) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially");
    return false;
  }

  // Respect a never-inline verdict from the cost analysis.
  if (IC.isNever()) {
    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
    return false;
  }

  // The inline cost exceeds the threshold.
  if (!IC) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit(
        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
        << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
        << NV("Caller", Caller) << " runtime overhead (overhead="
        << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
        << ", savings="
        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
        << " of making the outlined call is too high");

    return false;
  }

  // Profitable: report the cost/threshold used for the decision.
  ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
  return true;
}
518 
519 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
520 // For now use a simplified version. The returned 'InlineCost' will be used
521 // to esimate the size cost as well as runtime cost of the BB.
522 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
523   int InlineCost = 0;
524   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
525   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
526     if (isa<DbgInfoIntrinsic>(I))
527       continue;
528 
529     switch (I->getOpcode()) {
530     case Instruction::BitCast:
531     case Instruction::PtrToInt:
532     case Instruction::IntToPtr:
533     case Instruction::Alloca:
534       continue;
535     case Instruction::GetElementPtr:
536       if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
537         continue;
538     default:
539       break;
540     }
541 
542     IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
543     if (IntrInst) {
544       if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
545           IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
546         continue;
547     }
548 
549     if (CallInst *CI = dyn_cast<CallInst>(I)) {
550       InlineCost += getCallsiteCost(CallSite(CI), DL);
551       continue;
552     }
553 
554     if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
555       InlineCost += getCallsiteCost(CallSite(II), DL);
556       continue;
557     }
558 
559     if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
560       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
561       continue;
562     }
563     InlineCost += InlineConstants::InstrCost;
564   }
565   return InlineCost;
566 }
567 
568 std::tuple<int, int>
569 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
570 
571   // Now compute the cost of the call sequence to the outlined function
572   // 'OutlinedFunction' in BB 'OutliningCallBB':
573   int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
574 
575   // Now compute the cost of the extracted/outlined function itself:
576   int OutlinedFunctionCost = 0;
577   for (BasicBlock &BB : *Cloner.OutlinedFunc) {
578     OutlinedFunctionCost += computeBBInlineCost(&BB);
579   }
580 
581   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
582          "Outlined function cost should be no less than the outlined region");
583   // The code extractor introduces a new root and exit stub blocks with
584   // additional unconditional branches. Those branches will be eliminated
585   // later with bb layout. The cost should be adjusted accordingly:
586   OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
587 
588   int OutliningRuntimeOverhead =
589       OutliningFuncCallCost +
590       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
591       ExtraOutliningPenalty;
592 
593   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
594 }
595 
596 // Create the callsite to profile count map which is
597 // used to update the original function's entry count,
598 // after the function is partially inlined into the callsite.
599 void PartialInlinerImpl::computeCallsiteToProfCountMap(
600     Function *DuplicateFunction,
601     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
602   std::vector<User *> Users(DuplicateFunction->user_begin(),
603                             DuplicateFunction->user_end());
604   Function *CurrentCaller = nullptr;
605   std::unique_ptr<BlockFrequencyInfo> TempBFI;
606   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
607 
608   auto ComputeCurrBFI = [&,this](Function *Caller) {
609       // For the old pass manager:
610       if (!GetBFI) {
611         DominatorTree DT(*Caller);
612         LoopInfo LI(DT);
613         BranchProbabilityInfo BPI(*Caller, LI);
614         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
615         CurrentCallerBFI = TempBFI.get();
616       } else {
617         // New pass manager:
618         CurrentCallerBFI = &(*GetBFI)(*Caller);
619       }
620   };
621 
622   for (User *User : Users) {
623     CallSite CS = getCallSite(User);
624     Function *Caller = CS.getCaller();
625     if (CurrentCaller != Caller) {
626       CurrentCaller = Caller;
627       ComputeCurrBFI(Caller);
628     } else {
629       assert(CurrentCallerBFI && "CallerBFI is not set");
630     }
631     BasicBlock *CallBB = CS.getInstruction()->getParent();
632     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
633     if (Count)
634       CallSiteToProfCountMap[User] = *Count;
635     else
636       CallSiteToProfCountMap[User] = 0;
637   }
638 }
639 
// Clone F and translate the outlining info OI onto the clone, so the
// clone can be modified freely while the original stays intact.
PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Map each block recorded in OI to its clone via VMap.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}
662 
663 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
664 
665   auto getFirstPHI = [](BasicBlock *BB) {
666     BasicBlock::iterator I = BB->begin();
667     PHINode *FirstPhi = nullptr;
668     while (I != BB->end()) {
669       PHINode *Phi = dyn_cast<PHINode>(I);
670       if (!Phi)
671         break;
672       if (!FirstPhi) {
673         FirstPhi = Phi;
674         break;
675       }
676     }
677     return FirstPhi;
678   };
679 
680   // Special hackery is needed with PHI nodes that have inputs from more than
681   // one extracted block.  For simplicity, just split the PHIs into a two-level
682   // sequence of PHIs, some of which will go in the extracted region, and some
683   // of which will go outside.
684   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
685   // only split block when necessary:
686   PHINode *FirstPhi = getFirstPHI(PreReturn);
687   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
688 
689   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
690     return;
691 
692   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
693     Value *CommonValue = PN->getIncomingValue(0);
694     if (all_of(PN->incoming_values(),
695                [&](Value *V) { return V == CommonValue; }))
696       return CommonValue;
697     return nullptr;
698   };
699 
700   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
701       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
702   BasicBlock::iterator I = PreReturn->begin();
703   Instruction *Ins = &ClonedOI->ReturnBlock->front();
704   SmallVector<Instruction *, 4> DeadPhis;
705   while (I != PreReturn->end()) {
706     PHINode *OldPhi = dyn_cast<PHINode>(I);
707     if (!OldPhi)
708       break;
709 
710     PHINode *RetPhi =
711         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
712     OldPhi->replaceAllUsesWith(RetPhi);
713     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
714 
715     RetPhi->addIncoming(&*I, PreReturn);
716     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
717       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
718       OldPhi->removeIncomingValue(E);
719     }
720 
721     // After incoming values splitting, the old phi may become trivial.
722     // Keeping the trivial phi can introduce definition inside the outline
723     // region which is live-out, causing necessary overhead (load, store
724     // arg passing etc).
725     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
726       OldPhi->replaceAllUsesWith(OldPhiVal);
727       DeadPhis.push_back(OldPhi);
728     }
729     ++I;
730     }
731     for (auto *DP : DeadPhis)
732       DP->eraseFromParent();
733 
734     for (auto E : ClonedOI->ReturnBlockPreds) {
735       E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
736     }
737 }
738 
739 Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
740   // Returns true if the block is to be partial inlined into the caller
741   // (i.e. not to be extracted to the out of line function)
742   auto ToBeInlined = [&, this](BasicBlock *BB) {
743     return BB == ClonedOI->ReturnBlock ||
744            (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
745             ClonedOI->Entries.end());
746   };
747 
748   // Gather up the blocks that we're going to extract.
749   std::vector<BasicBlock *> ToExtract;
750   ToExtract.push_back(ClonedOI->NonReturnBlock);
751   OutlinedRegionCost +=
752       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
753   for (BasicBlock &BB : *ClonedFunc)
754     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
755       ToExtract.push_back(&BB);
756       // FIXME: the code extractor may hoist/sink more code
757       // into the outlined function which may make the outlining
758       // overhead (the difference of the outlined function cost
759       // and OutliningRegionCost) look larger.
760       OutlinedRegionCost += computeBBInlineCost(&BB);
761     }
762 
763   // The CodeExtractor needs a dominator tree.
764   DominatorTree DT;
765   DT.recalculate(*ClonedFunc);
766 
767   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
768   LoopInfo LI(DT);
769   BranchProbabilityInfo BPI(*ClonedFunc, LI);
770   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
771 
772   // Extract the body of the if.
773   OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
774                                ClonedFuncBFI.get(), &BPI)
775                      .extractCodeRegion();
776 
777   if (OutlinedFunc) {
778     OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
779         .getInstruction()
780         ->getParent();
781     assert(OutliningCallBB->getParent() == ClonedFunc);
782   }
783 
784   return OutlinedFunc;
785 }
786 
787 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
788   // Ditch the duplicate, since we're done with it, and rewrite all remaining
789   // users (function pointers, etc.) back to the original function.
790   ClonedFunc->replaceAllUsesWith(OrigFunc);
791   ClonedFunc->eraseFromParent();
792   if (!IsFunctionInlined) {
793     // Remove the function that is speculatively created if there is no
794     // reference.
795     if (OutlinedFunc)
796       OutlinedFunc->eraseFromParent();
797   }
798 }
799 
800 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
801 
802   if (F->hasAddressTaken())
803     return nullptr;
804 
805   // Let inliner handle it
806   if (F->hasFnAttribute(Attribute::AlwaysInline))
807     return nullptr;
808 
809   if (F->hasFnAttribute(Attribute::NoInline))
810     return nullptr;
811 
812   if (PSI->isFunctionEntryCold(F))
813     return nullptr;
814 
815   if (F->user_begin() == F->user_end())
816     return nullptr;
817 
818   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
819 
820   if (!OI)
821     return nullptr;
822 
823   FunctionCloner Cloner(F, OI.get());
824   Cloner.NormalizeReturnBlock();
825   Function *OutlinedFunction = Cloner.doFunctionOutlining();
826 
827   bool AnyInline = tryPartialInline(Cloner);
828 
829   if (AnyInline)
830     return OutlinedFunction;
831 
832   return nullptr;
833 }
834 
835 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
836   int NonWeightedRcost;
837   int SizeCost;
838 
839   if (Cloner.OutlinedFunc == nullptr)
840     return false;
841 
842   std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
843 
844   auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
845   auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
846 
847   // The call sequence to the outlined function is larger than the original
848   // outlined region size, it does not increase the chances of inlining
849   // the function with outlining (The inliner uses the size increase to
850   // model the cost of inlining a callee).
851   if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
852     OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
853     DebugLoc DLoc;
854     BasicBlock *Block;
855     std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
856     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
857                                         DLoc, Block)
858              << ore::NV("Function", Cloner.OrigFunc)
859              << " not partially inlined into callers (Original Size = "
860              << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
861              << ", Size of call sequence to outlined function = "
862              << ore::NV("NewSize", SizeCost) << ")");
863     return false;
864   }
865 
866   assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
867          "F's users should all be replaced!");
868 
869   std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
870                             Cloner.ClonedFunc->user_end());
871 
872   DenseMap<User *, uint64_t> CallSiteToProfCountMap;
873   if (Cloner.OrigFunc->getEntryCount())
874     computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
875 
876   auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
877   uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
878 
879   bool AnyInline = false;
880   for (User *User : Users) {
881     CallSite CS = getCallSite(User);
882 
883     if (IsLimitReached())
884       continue;
885 
886     OptimizationRemarkEmitter ORE(CS.getCaller());
887 
888     if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
889       continue;
890 
891     ORE.emit(
892         OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
893         << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
894         << ore::NV("Caller", CS.getCaller()));
895 
896     InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
897     InlineFunction(CS, IFI);
898 
899     // Now update the entry count:
900     if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
901       uint64_t CallSiteCount = CallSiteToProfCountMap[User];
902       CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
903     }
904 
905     AnyInline = true;
906     NumPartialInlining++;
907     // Update the stats
908     NumPartialInlined++;
909   }
910 
911   if (AnyInline) {
912     Cloner.IsFunctionInlined = true;
913     if (CalleeEntryCount)
914       Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
915   }
916 
917   return AnyInline;
918 }
919 
920 bool PartialInlinerImpl::run(Module &M) {
921   if (DisablePartialInlining)
922     return false;
923 
924   std::vector<Function *> Worklist;
925   Worklist.reserve(M.size());
926   for (Function &F : M)
927     if (!F.use_empty() && !F.isDeclaration())
928       Worklist.push_back(&F);
929 
930   bool Changed = false;
931   while (!Worklist.empty()) {
932     Function *CurrFunc = Worklist.back();
933     Worklist.pop_back();
934 
935     if (CurrFunc->use_empty())
936       continue;
937 
938     bool Recursive = false;
939     for (User *U : CurrFunc->users())
940       if (Instruction *I = dyn_cast<Instruction>(U))
941         if (I->getParent()->getParent() == CurrFunc) {
942           Recursive = true;
943           break;
944         }
945     if (Recursive)
946       continue;
947 
948     if (Function *NewFunc = unswitchFunction(CurrFunc)) {
949       Worklist.push_back(NewFunc);
950       Changed = true;
951     }
952   }
953 
954   return Changed;
955 }
956 
957 char PartialInlinerLegacyPass::ID = 0;
958 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
959                       "Partial Inliner", false, false)
960 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
961 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
962 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
963 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
964                     "Partial Inliner", false, false)
965 
// Factory entry point used by the legacy pass manager to construct this pass.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
969 
970 PreservedAnalyses PartialInlinerPass::run(Module &M,
971                                           ModuleAnalysisManager &AM) {
972   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
973 
974   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
975       [&FAM](Function &F) -> AssumptionCache & {
976     return FAM.getResult<AssumptionAnalysis>(F);
977   };
978 
979   std::function<BlockFrequencyInfo &(Function &)> GetBFI =
980       [&FAM](Function &F) -> BlockFrequencyInfo & {
981     return FAM.getResult<BlockFrequencyAnalysis>(F);
982   };
983 
984   std::function<TargetTransformInfo &(Function &)> GetTTI =
985       [&FAM](Function &F) -> TargetTransformInfo & {
986     return FAM.getResult<TargetIRAnalysis>(F);
987   };
988 
989   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
990 
991   if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
992     return PreservedAnalyses::none();
993   return PreservedAnalyses::all();
994 }
995