1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass performs partial inlining, typically by inlining an if statement
11 // that surrounds the body of the function.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/Transforms/IPO/PartialInlining.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/BlockFrequencyInfo.h"
18 #include "llvm/Analysis/BranchProbabilityInfo.h"
19 #include "llvm/Analysis/CodeMetrics.h"
20 #include "llvm/Analysis/InlineCost.h"
21 #include "llvm/Analysis/LoopInfo.h"
22 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
23 #include "llvm/Analysis/ProfileSummaryInfo.h"
24 #include "llvm/Analysis/TargetLibraryInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Module.h"
31 #include "llvm/Pass.h"
32 #include "llvm/Transforms/IPO.h"
33 #include "llvm/Transforms/Utils/Cloning.h"
34 #include "llvm/Transforms/Utils/CodeExtractor.h"
35 using namespace llvm;
36 
37 #define DEBUG_TYPE "partial-inlining"
38 
39 STATISTIC(NumPartialInlined,
40           "Number of callsites functions partially inlined into.");
41 
42 // Command line option to disable partial-inlining. The default is false:
43 static cl::opt<bool>
44     DisablePartialInlining("disable-partial-inlining", cl::init(false),
45                            cl::Hidden, cl::desc("Disable partial ininling"));
46 // This is an option used by testing:
47 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
48                                       cl::init(false), cl::ZeroOrMore,
49                                       cl::ReallyHidden,
50                                       cl::desc("Skip Cost Analysis"));
51 
52 static cl::opt<unsigned> MaxNumInlineBlocks(
53     "max-num-inline-blocks", cl::init(5), cl::Hidden,
54     cl::desc("Max number of blocks to be partially inlined"));
55 
56 // Command line option to set the maximum number of partial inlining allowed
57 // for the module. The default value of -1 means no limit.
58 static cl::opt<int> MaxNumPartialInlining(
59     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
60     cl::desc("Max number of partial inlining. The default is unlimited"));
61 
62 // Used only when PGO or user annotated branch data is absent. It is
63 // the least value that is used to weigh the outline region. If BFI
64 // produces larger value, the BFI value will be used.
65 static cl::opt<int>
66     OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
67                              cl::Hidden, cl::ZeroOrMore,
68                              cl::desc("Relative frequency of outline region to "
69                                       "the entry block"));
70 
71 static cl::opt<unsigned> ExtraOutliningPenalty(
72     "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
73     cl::desc("A debug option to add additional penalty to the computed one."));
74 
75 namespace {
76 
77 struct FunctionOutliningInfo {
78   FunctionOutliningInfo()
79       : Entries(), ReturnBlock(nullptr), NonReturnBlock(nullptr),
80         ReturnBlockPreds() {}
81   // Returns the number of blocks to be inlined including all blocks
82   // in Entries and one return block.
83   unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }
84 
85   // A set of blocks including the function entry that guard
86   // the region to be outlined.
87   SmallVector<BasicBlock *, 4> Entries;
88   // The return block that is not included in the outlined region.
89   BasicBlock *ReturnBlock;
90   // The dominating block of the region to be outlined.
91   BasicBlock *NonReturnBlock;
92   // The set of blocks in Entries that that are predecessors to ReturnBlock
93   SmallVector<BasicBlock *, 4> ReturnBlockPreds;
94 };
95 
struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
  // Entry point: attempts partial inlining on the functions of module M.
  bool run(Module &M);
  // Attempts to partially inline F; returns the newly created outlined
  // function on success, nullptr otherwise.
  Function *unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining:
    Function *doFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;
    Function *OutlinedFunc = nullptr;
    // The block in ClonedFunc that contains the call to OutlinedFunc
    // (set by doFunctionOutlining on success).
    BasicBlock *OutliningCallBB = nullptr;
    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    // Outlining info remapped onto ClonedFunc's blocks via the clone's VMap.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined with
  // profit.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  // Returns true once the module-wide -max-partial-inlining budget is used up.
  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // Wraps a user of the function in a CallSite; every user is expected to
  // be a call or an invoke.
  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  // Returns the call site of F's first user; F must have at least one user.
  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  // Returns the debug location and enclosing block of one call site to F;
  // used when emitting optimization remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);
  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  // Analyzes F's CFG looking for the guarded-region shape described by
  // FunctionOutliningInfo; returns null if F is not a candidate.
  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);

};
207 
208 struct PartialInlinerLegacyPass : public ModulePass {
209   static char ID; // Pass identification, replacement for typeid
210   PartialInlinerLegacyPass() : ModulePass(ID) {
211     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
212   }
213 
214   void getAnalysisUsage(AnalysisUsage &AU) const override {
215     AU.addRequired<AssumptionCacheTracker>();
216     AU.addRequired<ProfileSummaryInfoWrapperPass>();
217     AU.addRequired<TargetTransformInfoWrapperPass>();
218   }
219   bool runOnModule(Module &M) override {
220     if (skipModule(M))
221       return false;
222 
223     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
224     TargetTransformInfoWrapperPass *TTIWP =
225         &getAnalysis<TargetTransformInfoWrapperPass>();
226     ProfileSummaryInfo *PSI =
227         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
228 
229     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
230         [&ACT](Function &F) -> AssumptionCache & {
231       return ACT->getAssumptionCache(F);
232     };
233 
234     std::function<TargetTransformInfo &(Function &)> GetTTI =
235         [&TTIWP](Function &F) -> TargetTransformInfo & {
236       return TTIWP->getTTI(F);
237     };
238 
239     return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
240   }
241 };
242 }
243 
// Analyze F's CFG looking for a chain of "guarding" entry blocks, each with
// two successors, one of which (eventually) is a return block. The remaining
// region, dominated by NonReturnBlock, becomes the outlining candidate.
// Returns null when F does not match the expected shape.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair so a return block (if any) comes first.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  // Walk the chain of two-successor blocks starting at the entry until a
  // return block is found (candidate) or the shape is broken.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    // No direct return successor; continue the chain through the
    // triangular shape's non-common successor.
    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;

  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor which is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the entry chain is self-contained, and records which entries
  // branch directly to the return block (ReturnBlockPreds).
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
397 
398 // Check if there is PGO data or user annoated branch data:
399 static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
400   if (F->getEntryCount())
401     return true;
402   // Now check if any of the entry block has MD_prof data:
403   for (auto *E : OI->Entries) {
404     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
405     if (!BR || BR->isUnconditional())
406       continue;
407     uint64_t T, F;
408     if (BR->extractProfMetadata(T, F))
409       return true;
410   }
411   return false;
412 }
413 
BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {

  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  // With real profile data (PGO or user branch annotations), trust the
  // computed relative frequency as-is.
  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}
449 
// Decide whether partially inlining Cloner.ClonedFunc into the caller at CS
// is profitable, emitting an optimization remark explaining the verdict.
bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {

  using namespace ore;
  // Testing-only escape hatch that bypasses all profitability checks.
  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // Reuse the regular inliner's cost model on the cloned callee.
  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  // Always-inline candidates should be fully inlined instead.
  if (IC.isAlways()) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially");
    return false;
  }

  if (IC.isNever()) {
    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)");
    return false;
  }

  if (!IC) {
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit(
        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
        << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
        << NV("Caller", Caller) << " runtime overhead (overhead="
        << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
        << ", savings="
        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
        << " of making the outlined call is too high");

    return false;
  }

  ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
  return true;
}
517 
518 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
519 // For now use a simplified version. The returned 'InlineCost' will be used
520 // to esimate the size cost as well as runtime cost of the BB.
521 int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
522   int InlineCost = 0;
523   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
524   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
525     if (isa<DbgInfoIntrinsic>(I))
526       continue;
527 
528     switch (I->getOpcode()) {
529     case Instruction::BitCast:
530     case Instruction::PtrToInt:
531     case Instruction::IntToPtr:
532     case Instruction::Alloca:
533       continue;
534     case Instruction::GetElementPtr:
535       if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
536         continue;
537     default:
538       break;
539     }
540 
541     IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
542     if (IntrInst) {
543       if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
544           IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
545         continue;
546     }
547 
548     if (CallInst *CI = dyn_cast<CallInst>(I)) {
549       InlineCost += getCallsiteCost(CallSite(CI), DL);
550       continue;
551     }
552 
553     if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
554       InlineCost += getCallsiteCost(CallSite(II), DL);
555       continue;
556     }
557 
558     if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
559       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
560       continue;
561     }
562     InlineCost += InlineConstants::InstrCost;
563   }
564   return InlineCost;
565 }
566 
567 std::tuple<int, int>
568 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
569 
570   // Now compute the cost of the call sequence to the outlined function
571   // 'OutlinedFunction' in BB 'OutliningCallBB':
572   int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);
573 
574   // Now compute the cost of the extracted/outlined function itself:
575   int OutlinedFunctionCost = 0;
576   for (BasicBlock &BB : *Cloner.OutlinedFunc) {
577     OutlinedFunctionCost += computeBBInlineCost(&BB);
578   }
579 
580   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
581          "Outlined function cost should be no less than the outlined region");
582   // The code extractor introduces a new root and exit stub blocks with
583   // additional unconditional branches. Those branches will be eliminated
584   // later with bb layout. The cost should be adjusted accordingly:
585   OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;
586 
587   int OutliningRuntimeOverhead =
588       OutliningFuncCallCost +
589       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
590       ExtraOutliningPenalty;
591 
592   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
593 }
594 
// Create the callsite to profile count map which is
// used to update the original function's entry count,
// after the function is partially inlined into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  // Snapshot the users first since the map insertions below must not
  // iterate a live use list.
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  // (Re)computes BFI for Caller and caches it in CurrentCallerBFI.
  auto ComputeCurrBFI = [&,this](Function *Caller) {
      // For the old pass manager:
      if (!GetBFI) {
        DominatorTree DT(*Caller);
        LoopInfo LI(DT);
        BranchProbabilityInfo BPI(*Caller, LI);
        TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
        CurrentCallerBFI = TempBFI.get();
      } else {
        // New pass manager:
        CurrentCallerBFI = &(*GetBFI)(*Caller);
      }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    // Only recompute BFI when the caller changes from the previous
    // iteration; consecutive call sites from the same caller reuse it.
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}
638 
// Clone F and remap OI's blocks (entries, return block, non-return block,
// return-block predecessors) onto the clone, then redirect all of F's users
// to the clone so the inliner can operate on it.
PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  // Translate the outlining info through the clone's value map.
  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}
661 
662 void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
663 
664   auto getFirstPHI = [](BasicBlock *BB) {
665     BasicBlock::iterator I = BB->begin();
666     PHINode *FirstPhi = nullptr;
667     while (I != BB->end()) {
668       PHINode *Phi = dyn_cast<PHINode>(I);
669       if (!Phi)
670         break;
671       if (!FirstPhi) {
672         FirstPhi = Phi;
673         break;
674       }
675     }
676     return FirstPhi;
677   };
678 
679   // Special hackery is needed with PHI nodes that have inputs from more than
680   // one extracted block.  For simplicity, just split the PHIs into a two-level
681   // sequence of PHIs, some of which will go in the extracted region, and some
682   // of which will go outside.
683   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
684   // only split block when necessary:
685   PHINode *FirstPhi = getFirstPHI(PreReturn);
686   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
687 
688   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
689     return;
690 
691   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
692     Value *CommonValue = PN->getIncomingValue(0);
693     if (all_of(PN->incoming_values(),
694                [&](Value *V) { return V == CommonValue; }))
695       return CommonValue;
696     return nullptr;
697   };
698 
699   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
700       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
701   BasicBlock::iterator I = PreReturn->begin();
702   Instruction *Ins = &ClonedOI->ReturnBlock->front();
703   SmallVector<Instruction *, 4> DeadPhis;
704   while (I != PreReturn->end()) {
705     PHINode *OldPhi = dyn_cast<PHINode>(I);
706     if (!OldPhi)
707       break;
708 
709     PHINode *RetPhi =
710         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
711     OldPhi->replaceAllUsesWith(RetPhi);
712     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
713 
714     RetPhi->addIncoming(&*I, PreReturn);
715     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
716       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
717       OldPhi->removeIncomingValue(E);
718     }
719 
720     // After incoming values splitting, the old phi may become trivial.
721     // Keeping the trivial phi can introduce definition inside the outline
722     // region which is live-out, causing necessary overhead (load, store
723     // arg passing etc).
724     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
725       OldPhi->replaceAllUsesWith(OldPhiVal);
726       DeadPhis.push_back(OldPhi);
727     }
728     ++I;
729     }
730     for (auto *DP : DeadPhis)
731       DP->eraseFromParent();
732 
733     for (auto E : ClonedOI->ReturnBlockPreds) {
734       E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
735     }
736 }
737 
738 Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
739   // Returns true if the block is to be partial inlined into the caller
740   // (i.e. not to be extracted to the out of line function)
741   auto ToBeInlined = [&, this](BasicBlock *BB) {
742     return BB == ClonedOI->ReturnBlock ||
743            (std::find(ClonedOI->Entries.begin(), ClonedOI->Entries.end(), BB) !=
744             ClonedOI->Entries.end());
745   };
746 
747   // Gather up the blocks that we're going to extract.
748   std::vector<BasicBlock *> ToExtract;
749   ToExtract.push_back(ClonedOI->NonReturnBlock);
750   OutlinedRegionCost +=
751       PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
752   for (BasicBlock &BB : *ClonedFunc)
753     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
754       ToExtract.push_back(&BB);
755       // FIXME: the code extractor may hoist/sink more code
756       // into the outlined function which may make the outlining
757       // overhead (the difference of the outlined function cost
758       // and OutliningRegionCost) look larger.
759       OutlinedRegionCost += computeBBInlineCost(&BB);
760     }
761 
762   // The CodeExtractor needs a dominator tree.
763   DominatorTree DT;
764   DT.recalculate(*ClonedFunc);
765 
766   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
767   LoopInfo LI(DT);
768   BranchProbabilityInfo BPI(*ClonedFunc, LI);
769   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
770 
771   // Extract the body of the if.
772   OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
773                                ClonedFuncBFI.get(), &BPI)
774                      .extractCodeRegion();
775 
776   if (OutlinedFunc) {
777     OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
778         .getInstruction()
779         ->getParent();
780     assert(OutliningCallBB->getParent() == ClonedFunc);
781   }
782 
783   return OutlinedFunc;
784 }
785 
786 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
787   // Ditch the duplicate, since we're done with it, and rewrite all remaining
788   // users (function pointers, etc.) back to the original function.
789   ClonedFunc->replaceAllUsesWith(OrigFunc);
790   ClonedFunc->eraseFromParent();
791   if (!IsFunctionInlined) {
792     // Remove the function that is speculatively created if there is no
793     // reference.
794     if (OutlinedFunc)
795       OutlinedFunc->eraseFromParent();
796   }
797 }
798 
799 Function *PartialInlinerImpl::unswitchFunction(Function *F) {
800 
801   if (F->hasAddressTaken())
802     return nullptr;
803 
804   // Let inliner handle it
805   if (F->hasFnAttribute(Attribute::AlwaysInline))
806     return nullptr;
807 
808   if (F->hasFnAttribute(Attribute::NoInline))
809     return nullptr;
810 
811   if (PSI->isFunctionEntryCold(F))
812     return nullptr;
813 
814   if (F->user_begin() == F->user_end())
815     return nullptr;
816 
817   std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
818 
819   if (!OI)
820     return nullptr;
821 
822   FunctionCloner Cloner(F, OI.get());
823   Cloner.NormalizeReturnBlock();
824   Function *OutlinedFunction = Cloner.doFunctionOutlining();
825 
826   bool AnyInline = tryPartialInline(Cloner);
827 
828   if (AnyInline)
829     return OutlinedFunction;
830 
831   return nullptr;
832 }
833 
// Inline the (partially inlinable) cloned body into every eligible call site
// of the original function. Returns true if at least one call site was
// partially inlined.
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  int NonWeightedRcost;
  int SizeCost;

  // Outlining may have failed (e.g. the code extractor bailed out).
  if (Cloner.OutlinedFunc == nullptr)
    return false;

  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  // Scale the runtime cost by how frequently the block containing the call
  // to the outlined function executes relative to the function entry.
  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence to the outlined function is larger than the original
  // outlined region size, it does not increase the chances of inlining
  // the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")");
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  // Snapshot the users up front: InlineFunction below mutates the use list
  // while we iterate.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  // Map each call site to its profile count so the callee's entry count can
  // be decremented as call sites are inlined away.
  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    // Presumably a testing knob capping the number of partial inlines;
    // IsLimitReached is defined elsewhere in this file - confirm.
    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;

    ORE.emit(
        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
        << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
        << ore::NV("Caller", CS.getCaller()));

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    InlineFunction(CS, IFI);

    // Now update the entry count:
    // Saturating subtraction - never lets the remaining entry count go
    // negative.
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    // NOTE(review): two distinct counters - NumPartialInlining appears to be
    // the limit counter consulted by IsLimitReached (defined elsewhere);
    // NumPartialInlined is the STATISTIC declared at the top of this file.
    NumPartialInlining++;
    // Update the stats
    NumPartialInlined++;
  }

  if (AnyInline) {
    // Tell the cloner to keep the outlined function, and write back the
    // reduced entry count to the original function.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
  }

  return AnyInline;
}
918 
919 bool PartialInlinerImpl::run(Module &M) {
920   if (DisablePartialInlining)
921     return false;
922 
923   std::vector<Function *> Worklist;
924   Worklist.reserve(M.size());
925   for (Function &F : M)
926     if (!F.use_empty() && !F.isDeclaration())
927       Worklist.push_back(&F);
928 
929   bool Changed = false;
930   while (!Worklist.empty()) {
931     Function *CurrFunc = Worklist.back();
932     Worklist.pop_back();
933 
934     if (CurrFunc->use_empty())
935       continue;
936 
937     bool Recursive = false;
938     for (User *U : CurrFunc->users())
939       if (Instruction *I = dyn_cast<Instruction>(U))
940         if (I->getParent()->getParent() == CurrFunc) {
941           Recursive = true;
942           break;
943         }
944     if (Recursive)
945       continue;
946 
947     if (Function *NewFunc = unswitchFunction(CurrFunc)) {
948       Worklist.push_back(NewFunc);
949       Changed = true;
950     }
951   }
952 
953   return Changed;
954 }
955 
// Legacy pass-manager registration. The DEPENDENCY entries declare the
// analysis passes this pass relies on so they are initialized first.
char PartialInlinerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)
964 
965 ModulePass *llvm::createPartialInliningPass() {
966   return new PartialInlinerLegacyPass();
967 }
968 
969 PreservedAnalyses PartialInlinerPass::run(Module &M,
970                                           ModuleAnalysisManager &AM) {
971   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
972 
973   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
974       [&FAM](Function &F) -> AssumptionCache & {
975     return FAM.getResult<AssumptionAnalysis>(F);
976   };
977 
978   std::function<BlockFrequencyInfo &(Function &)> GetBFI =
979       [&FAM](Function &F) -> BlockFrequencyInfo & {
980     return FAM.getResult<BlockFrequencyAnalysis>(F);
981   };
982 
983   std::function<TargetTransformInfo &(Function &)> GetTTI =
984       [&FAM](Function &F) -> TargetTransformInfo & {
985     return FAM.getResult<TargetIRAnalysis>(F);
986   };
987 
988   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
989 
990   if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
991     return PreservedAnalyses::none();
992   return PreservedAnalyses::all();
993 }
994