//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
//
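// As an illustrative sketch (function and block names are hypothetical),
// a candidate of the form
//
//   void foo() {
//     if (cheap_guard)        // entry block: kept and inlined into callers
//       return;
//     ... expensive body ...  // extracted into a separate function
//   }
//
// is cloned; the expensive body is outlined into a new function, and the
// remaining guard plus the call to the outlined body is inlined into any
// caller that the cost model accepts.
//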
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions are partially inlined into.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlinings allowed
// in the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user-annotated branch data is absent. It is the
// minimum relative frequency assumed for the outline region: if BFI
// produces a larger value, the BFI value is used instead.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined, i.e. all blocks
  // in Entries plus one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock.
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
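
// As an illustrative sketch (block names are hypothetical), a candidate
// region has the shape:
//
//   Entry0 -> Entry1 -> ... -> NonReturnBlock   (head of the outlined region)
//      \         \
//       +---------+--------> ReturnBlock
//
// Each entry block branches either to the next entry or to one of
// {ReturnBlock, NonReturnBlock}; the entries and ReturnBlock stay inline,
// while the region dominated by NonReturnBlock is extracted.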

struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);
  Function *unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
    ~FunctionCloner();

    // Prepare for function outlining: make sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining:
    Function *doFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;
    Function *OutlinedFunc = nullptr;
    BasicBlock *OutliningCallBB = nullptr;
    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented as a BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks.)
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS can be profitably partially inlined.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F, with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from each use site of DuplicateFunction to the
  // enclosing BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime overhead of making the
  //   call to the outlined function, including the additional setup cost
  //   incurred in the outlined function itself.
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (note that the current
  // inline cost analysis makes no clear distinction between them either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor.
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };
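
  // As an illustrative sketch (hypothetical blocks), GetCommonSucc matches:
  //
  //        CurrEntry
  //        /       \
  //    Succ2 ----> Succ1
  //
  // Here Succ1 is a successor of Succ2, so Succ1 is the common successor
  // (CommSucc) and Succ2 becomes the next entry candidate (OtherSucc).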

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do a sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}.
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further grow the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user-annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks have MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/not taken), but the guessed
  // branch probability is usually not biased enough. When the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) so as not to under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual one. For instance, the actual probability of the
  // less likely target may be only 5%, while the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
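  //
  // For example, with the default OutlineRegionFreqPercent of 75, a
  // statically estimated relative frequency of 55% is raised to 75% below
  // before it is used to weight the outlining overhead, while an estimate
  // of 40% is kept as is.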
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it is too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted savings is smaller than the weighted cost, return false.
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because the runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share the Inliner's InlineCost analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate both the size cost and the runtime cost of the BB.
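//
// As an illustrative example (hypothetical block): an add, a store, and a
// three-case switch would be charged 1 + 1 + (3 + 1) = 6 units of
// InlineConstants::InstrCost, while bitcasts, allocas, zero-index GEPs, and
// debug/lifetime intrinsics are treated as free below.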
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
      // Intentional fallthrough: non-free GEPs are charged below.
    default:
      break;
    }

    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  // First compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  // The code extractor introduces new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;
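
  // Worked example with made-up numbers: if the call sequence costs 25, the
  // adjusted outlined-function cost is 230, and the original region cost
  // was 200, the estimated runtime overhead is 25 + (230 - 200) = 55, plus
  // any extra penalty requested on the command line.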

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite-to-profile-count map, which is used to update the
// original function's entry count after the function is partially inlined
// into the callsite.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  // Return the first PHI in BB, or nullptr if BB does not start with a PHI.
  // Only the first instruction needs to be inspected.
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
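  //
  // As an illustrative sketch (hypothetical names), a return-block PHI
  //
  //   %v = phi [%a, %entry0], [%b, %entry1], [%c, %body0], [%d, %body1]
  //
  // becomes a two-level sequence in which the values coming from the
  // outlined region stay behind in PreReturn:
  //
  //   PreReturn:   %v.old = phi [%c, %body0], [%d, %body1]
  //   ReturnBlock: %v = phi [%v.old, %PreReturn], [%a, %entry0], [%b, %entry1]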
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After the incoming values are split, the old phi may become trivial.
    // Keeping the trivial phi can introduce a definition inside the outlined
    // region which is live-out, causing unnecessary overhead (load, store,
    // arg passing, etc.).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted into the out-of-line function).
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           is_contained(ClonedOI->Entries, BB);
  };

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutlinedRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Extract the body of the if.
  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                               ClonedFuncBFI.get(), &BPI)
                     .extractCodeRegion();

  if (OutlinedFunc) {
    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
                          .getInstruction()
                          ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
  }

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove the function that was speculatively created if there is no
    // reference.
    if (OutlinedFunc)
      OutlinedFunc->eraseFromParent();
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {
  if (F->hasAddressTaken())
    return nullptr;

  // Let the inliner handle it:
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;

  FunctionCloner Cloner(F, OI.get());
  Cloner.NormalizeReturnBlock();
  Function *OutlinedFunction = Cloner.doFunctionOutlining();

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return OutlinedFunction;

  return nullptr;
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  int NonWeightedRcost;
  int SizeCost;

  if (Cloner.OutlinedFunc == nullptr)
    return false;

  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
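
  // Illustrative numbers: with a non-weighted runtime cost of 55 and an
  // outlining-call frequency that is 75% of the entry frequency, the
  // weighted cost compared against the callsite savings below is about 41.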

  // If the call sequence to the outlined function is larger than the original
  // outlined region, outlining does not increase the chances of inlining
  // the function (the inliner uses the size increase to model the cost of
  // inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;

    ORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined",
                                CS.getInstruction())
             << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
             << ore::NV("Caller", CS.getCaller());
    });

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    InlineFunction(CS, IFI);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats:
    NumPartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}