//===- PartialInlining.cpp - Inline parts of functions --------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs partial inlining, typically by inlining an if statement
// that surrounds the body of the function.
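//
// For example (a sketch, not the exact transformation):
//
//   int foo(int *p) {
//     if (!p)            // guarding entry block: gets inlined into callers
//       return -1;
//     ...heavy body...   // extracted into an outlined function; the inlined
//     return 0;          // guard only calls it when this path is taken
//   }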
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <memory>
#include <tuple>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "partial-inlining"

STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));

// This is an option used for testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));

static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the minimum relative frequency assigned to the outline region. If BFI
// produces a larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));

namespace {

struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned GetNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};

struct PartialInlinerImpl {
  PartialInlinerImpl(
      std::function<AssumptionCache &(Function &)> *GetAC,
      std::function<TargetTransformInfo &(Function &)> *GTTI,
      Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
      ProfileSummaryInfo *ProfSI)
      : GetAssumptionCache(GetAC), GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}

  bool run(Module &M);
  Function *unswitchFunction(Function *F);

  // This class speculatively clones the function to be partially inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    FunctionCloner(Function *F, FunctionOutliningInfo *OI);
    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void NormalizeReturnBlock();

    // Do function outlining.
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doFunctionOutlining();

    Function *OrigFunc = nullptr;
    Function *ClonedFunc = nullptr;
    Function *OutlinedFunc = nullptr;
    BasicBlock *OutliningCallBB = nullptr;
    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    int OutlinedRegionCost = 0;
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
  };

private:
  int NumPartialInlining = 0;
  std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
  std::function<TargetTransformInfo &(Function &)> *GetTTI;
  Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
  ProfileSummaryInfo *PSI;

  // Return the frequency of the OutliningCallBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BranchProbability.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);

  // Return true if the callee of CS should be partially inlined
  // profitably.
  bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE);

  // Try to inline DuplicateFunction (cloned from F with a call to
  // the OutlinedFunction) into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use sites of DuplicateFunction to the
  // enclosing BB's profile count.
  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                     DenseMap<User *, uint64_t> &SiteCountMap);

  bool IsLimitReached() {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  static CallSite getCallSite(User *U) {
    CallSite CS;
    if (CallInst *CI = dyn_cast<CallInst>(U))
      CS = CallSite(CI);
    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
      CS = CallSite(II);
    else
      llvm_unreachable("All uses must be calls");
    return CS;
  }

  static CallSite getOneCallSiteTo(Function *F) {
    User *User = *F->user_begin();
    return getCallSite(User);
  }

  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
    CallSite CS = getOneCallSiteTo(F);
    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
    BasicBlock *Block = CS.getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the estimated size of the new call sequence in the
  //   basic block Cloner.OutliningCallBB;
  // - The second value is the non-weighted runtime cost of making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself.
  std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner);

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static int computeBBInlineCost(BasicBlock *BB);

  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
};

struct PartialInlinerLegacyPass : public ModulePass {
  static char ID; // Pass identification, replacement for typeid

  PartialInlinerLegacyPass() : ModulePass(ID) {
    initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnModule(Module &M) override {
    if (skipModule(M))
      return false;

    AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
    TargetTransformInfoWrapperPass *TTIWP =
        &getAnalysis<TargetTransformInfoWrapperPass>();
    ProfileSummaryInfo *PSI =
        getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    std::function<AssumptionCache &(Function &)> GetAssumptionCache =
        [&ACT](Function &F) -> AssumptionCache & {
      return ACT->getAssumptionCache(F);
    };

    std::function<TargetTransformInfo &(Function &)> GetTTI =
        [&TTIWP](Function &F) -> TargetTransformInfo & {
      return TTIWP->getTTI(F);
    };

    return PartialInlinerImpl(&GetAssumptionCache, &GetTTI, None, PSI).run(M);
  }
};

} // end anonymous namespace

std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function *F) {
  BasicBlock *EntryBlock = &F->front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  auto SuccSize = [](BasicBlock *BB) {
    return std::distance(succ_begin(BB), succ_end(BB));
  };

  auto IsReturnBlock = [](BasicBlock *BB) {
    TerminatorInst *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape:
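  // For instance (a sketch of the CFG shape being matched; GetCommonSucc
  // returns the pair (CommSucc, OtherSucc)):
  //
  //        CurrEntry
  //        /       \
  //       v         v
  //   OtherSucc --> CommSucc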
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      llvm::make_unique<FunctionOutliningInfo>();

  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->GetNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (SuccSize(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc;
    BasicBlock *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // Do sanity check of the entries: there should not
  // be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F->front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a predecessor that is not
  // in the Entries set.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further grow the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->GetNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (SuccSize(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}

// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
  if (F->getEntryCount())
    return true;
  // Now check if any of the entry blocks have MD_prof data:
  for (auto *E : OI->Entries) {
    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
    if (!BR || BR->isUnconditional())
      continue;
    uint64_t T, F;
    if (BR->extractProfMetadata(T, F))
      return true;
  }
  return false;
}

BranchProbability
PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
  auto EntryFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
  auto OutliningCallFreq =
      Cloner.ClonedFuncBFI->getBlockFreq(Cloner.OutliningCallBB);

  auto OutlineRegionRelFreq =
      BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(),
                                              EntryFreq.getFrequency());

  if (hasProfileData(Cloner.OrigFunc, Cloner.ClonedOI.get()))
    return OutlineRegionRelFreq;

  // When profile data is not available, we need to be conservative in
  // estimating the overall savings. Static branch prediction can usually
  // guess the branch direction right (taken/non-taken), but the guessed
  // branch probability is usually not biased enough. In the case when the
  // outlined region is predicted to be likely, its probability needs
  // to be made higher (more biased) to not under-estimate the cost of
  // function outlining. On the other hand, if the outlined region
  // is predicted to be less likely, the predicted probability is usually
  // higher than the actual. For instance, the actual probability of the
  // less likely target is only 5%, but the guessed probability can be
  // 40%. In the latter case, there is no need for further adjustment.
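  // For example, with the default outline-region-freq-percent of 75, a
  // statically guessed relative frequency of 60% (above the 45% cutoff
  // below) is raised to 75% before weighing the outlining overhead.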
  // FIXME: add an option for this.
  if (OutlineRegionRelFreq < BranchProbability(45, 100))
    return OutlineRegionRelFreq;

  OutlineRegionRelFreq = std::max(
      OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));

  return OutlineRegionRelFreq;
}

bool PartialInlinerImpl::shouldPartialInline(
    CallSite CS, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) {
  using namespace ore;

  if (SkipCostAnalysis)
    return true;

  Instruction *Call = CS.getInstruction();
  Function *Callee = CS.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  Function *Caller = CS.getCaller();
  auto &CalleeTTI = (*GetTTI)(*Callee);
  InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
                                *GetAssumptionCache, GetBFI, PSI, &ORE);

  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CS, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // If the weighted savings is smaller than the weighted cost, return false:
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        Call)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}

// TODO: Ideally we should share the Inliner's InlineCost analysis code.
// For now use a simplified version. The returned 'InlineCost' will be used
// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
  int InlineCost = 0;
  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    if (isa<DbgInfoIntrinsic>(I))
      continue;

    switch (I->getOpcode()) {
    case Instruction::BitCast:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::Alloca:
      continue;
    case Instruction::GetElementPtr:
      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
        continue;
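      // A GEP with non-zero indices falls through and is charged as an
      // ordinary instruction below.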
    default:
      break;
    }

    IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(I);
    if (IntrInst) {
      if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start ||
          IntrInst->getIntrinsicID() == Intrinsic::lifetime_end)
        continue;
    }

    if (CallInst *CI = dyn_cast<CallInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(CI), DL);
      continue;
    }

    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
      InlineCost += getCallsiteCost(CallSite(II), DL);
      continue;
    }

    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
      continue;
    }
    InlineCost += InlineConstants::InstrCost;
  }
  return InlineCost;
}

std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) {
  // Now compute the cost of the call sequence to the outlined function
  // 'OutlinedFunction' in BB 'OutliningCallBB':
  int OutliningFuncCallCost = computeBBInlineCost(Cloner.OutliningCallBB);

  // Now compute the cost of the extracted/outlined function itself:
  int OutlinedFunctionCost = 0;
  for (BasicBlock &BB : *Cloner.OutlinedFunc) {
    OutlinedFunctionCost += computeBBInlineCost(&BB);
  }

  assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
         "Outlined function cost should be no less than the outlined region");
  // The code extractor introduces a new root and exit stub blocks with
  // additional unconditional branches. Those branches will be eliminated
  // later with bb layout. The cost should be adjusted accordingly:
  OutlinedFunctionCost -= 2 * InlineConstants::InstrCost;

  int OutliningRuntimeOverhead =
      OutliningFuncCallCost +
      (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
      ExtraOutliningPenalty;

  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
}

// Create the callsite-to-profile-count map, which is used to update the
// original function's entry count after the function is partially inlined
// into its callsites.
void PartialInlinerImpl::computeCallsiteToProfCountMap(
    Function *DuplicateFunction,
    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
  std::vector<User *> Users(DuplicateFunction->user_begin(),
                            DuplicateFunction->user_end());
  Function *CurrentCaller = nullptr;
  std::unique_ptr<BlockFrequencyInfo> TempBFI;
  BlockFrequencyInfo *CurrentCallerBFI = nullptr;

  auto ComputeCurrBFI = [&, this](Function *Caller) {
    // For the old pass manager:
    if (!GetBFI) {
      DominatorTree DT(*Caller);
      LoopInfo LI(DT);
      BranchProbabilityInfo BPI(*Caller, LI);
      TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
      CurrentCallerBFI = TempBFI.get();
    } else {
      // New pass manager:
      CurrentCallerBFI = &(*GetBFI)(*Caller);
    }
  };

  for (User *User : Users) {
    CallSite CS = getCallSite(User);
    Function *Caller = CS.getCaller();
    if (CurrentCaller != Caller) {
      CurrentCaller = Caller;
      ComputeCurrBFI(Caller);
    } else {
      assert(CurrentCallerBFI && "CallerBFI is not set");
    }
    BasicBlock *CallBB = CS.getInstruction()->getParent();
    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
    if (Count)
      CallSiteToProfCountMap[User] = *Count;
    else
      CallSiteToProfCountMap[User] = 0;
  }
}

PartialInlinerImpl::FunctionCloner::FunctionCloner(Function *F,
                                                   FunctionOutliningInfo *OI)
    : OrigFunc(F) {
  ClonedOI = llvm::make_unique<FunctionOutliningInfo>();

  // Clone the function, so that we can hack away on it.
  ValueToValueMapTy VMap;
  ClonedFunc = CloneFunction(F, VMap);

  ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
  ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
  for (BasicBlock *BB : OI->Entries) {
    ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
  }
  for (BasicBlock *E : OI->ReturnBlockPreds) {
    BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
    ClonedOI->ReturnBlockPreds.push_back(NewE);
  }
  // Go ahead and update all uses to the duplicate, so that we can just
  // use the inliner functionality when we're done hacking.
  F->replaceAllUsesWith(ClonedFunc);
}

void PartialInlinerImpl::FunctionCloner::NormalizeReturnBlock() {
  auto getFirstPHI = [](BasicBlock *BB) {
    BasicBlock::iterator I = BB->begin();
    PHINode *FirstPhi = nullptr;
    while (I != BB->end()) {
      PHINode *Phi = dyn_cast<PHINode>(I);
      if (!Phi)
        break;
      if (!FirstPhi) {
        FirstPhi = Phi;
        break;
      }
    }
    return FirstPhi;
  };

  // Special hackery is needed with PHI nodes that have inputs from more than
  // one extracted block.  For simplicity, just split the PHIs into a two-level
  // sequence of PHIs, some of which will go in the extracted region, and some
  // of which will go outside.
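  //
  // Roughly (block and value names are illustrative), a PHI in the original
  // return block such as
  //   %v = phi [%a, %entry1], [%b, %ext1], [%c, %ext2]
  // is split into
  //   PreReturn:   %v.ext = phi [%b, %ext1], [%c, %ext2]    ; extracted part
  //   ReturnBlock: %v     = phi [%v.ext, %PreReturn], [%a, %entry1]
  // with %entry1 re-targeted to branch to the new ReturnBlock.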
  BasicBlock *PreReturn = ClonedOI->ReturnBlock;
  // Only split the block when necessary:
  PHINode *FirstPhi = getFirstPHI(PreReturn);
  unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();

  if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
    return;

  auto IsTrivialPhi = [](PHINode *PN) -> Value * {
    Value *CommonValue = PN->getIncomingValue(0);
    if (all_of(PN->incoming_values(),
               [&](Value *V) { return V == CommonValue; }))
      return CommonValue;
    return nullptr;
  };

  ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
      ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
  BasicBlock::iterator I = PreReturn->begin();
  Instruction *Ins = &ClonedOI->ReturnBlock->front();
  SmallVector<Instruction *, 4> DeadPhis;
  while (I != PreReturn->end()) {
    PHINode *OldPhi = dyn_cast<PHINode>(I);
    if (!OldPhi)
      break;

    PHINode *RetPhi =
        PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
    OldPhi->replaceAllUsesWith(RetPhi);
    Ins = ClonedOI->ReturnBlock->getFirstNonPHI();

    RetPhi->addIncoming(&*I, PreReturn);
    for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
      RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
      OldPhi->removeIncomingValue(E);
    }

    // After incoming values splitting, the old phi may become trivial.
    // Keeping the trivial phi can introduce a definition inside the outlined
    // region which is live-out, causing unnecessary overhead (load, store,
    // arg passing, etc.).
    if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
      OldPhi->replaceAllUsesWith(OldPhiVal);
      DeadPhis.push_back(OldPhi);
    }
    ++I;
  }
  for (auto *DP : DeadPhis)
    DP->eraseFromParent();

  for (auto E : ClonedOI->ReturnBlockPreds) {
    E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
  }
}

Function *PartialInlinerImpl::FunctionCloner::doFunctionOutlining() {
  // Returns true if the block is to be partially inlined into the caller
  // (i.e. not to be extracted to the out-of-line function)
  auto ToBeInlined = [&, this](BasicBlock *BB) {
    return BB == ClonedOI->ReturnBlock ||
           is_contained(ClonedOI->Entries, BB);
  };

  // Gather up the blocks that we're going to extract.
  std::vector<BasicBlock *> ToExtract;
  ToExtract.push_back(ClonedOI->NonReturnBlock);
  OutlinedRegionCost +=
      PartialInlinerImpl::computeBBInlineCost(ClonedOI->NonReturnBlock);
  for (BasicBlock &BB : *ClonedFunc)
    if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
      ToExtract.push_back(&BB);
      // FIXME: the code extractor may hoist/sink more code
      // into the outlined function which may make the outlining
      // overhead (the difference of the outlined function cost
      // and OutliningRegionCost) look larger.
      OutlinedRegionCost += computeBBInlineCost(&BB);
    }

  // The CodeExtractor needs a dominator tree.
  DominatorTree DT;
  DT.recalculate(*ClonedFunc);

  // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
  LoopInfo LI(DT);
  BranchProbabilityInfo BPI(*ClonedFunc, LI);
  ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));

  // Extract the body of the if.
  OutlinedFunc = CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
                               ClonedFuncBFI.get(), &BPI,
                               /* AllowVarargs */ true)
                     .extractCodeRegion();

  if (OutlinedFunc) {
    OutliningCallBB = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
        .getInstruction()
        ->getParent();
    assert(OutliningCallBB->getParent() == ClonedFunc);
  }

  return OutlinedFunc;
}

PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
  // Ditch the duplicate, since we're done with it, and rewrite all remaining
  // users (function pointers, etc.) back to the original function.
  ClonedFunc->replaceAllUsesWith(OrigFunc);
  ClonedFunc->eraseFromParent();
  if (!IsFunctionInlined) {
    // Remove the function that is speculatively created if there is no
    // reference.
    if (OutlinedFunc)
      OutlinedFunc->eraseFromParent();
  }
}

Function *PartialInlinerImpl::unswitchFunction(Function *F) {
  if (F->hasAddressTaken())
    return nullptr;

  // Let the inliner handle it:
  if (F->hasFnAttribute(Attribute::AlwaysInline))
    return nullptr;

  if (F->hasFnAttribute(Attribute::NoInline))
    return nullptr;

  if (PSI->isFunctionEntryCold(F))
    return nullptr;

  if (F->user_begin() == F->user_end())
    return nullptr;

  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

  if (!OI)
    return nullptr;

  FunctionCloner Cloner(F, OI.get());
  Cloner.NormalizeReturnBlock();
  Function *OutlinedFunction = Cloner.doFunctionOutlining();

  bool AnyInline = tryPartialInline(Cloner);

  if (AnyInline)
    return OutlinedFunction;

  return nullptr;
}

bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  int NonWeightedRcost;
  int SizeCost;

  if (Cloner.OutlinedFunc == nullptr)
    return false;

  std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);

  auto RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  auto WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // If the call sequence to the outlined function is larger than the original
  // outlined region, partial inlining does not increase the chances of
  // inlining the function (the inliner uses the size increase to
  // model the cost of inlining a callee).
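  // For instance, a small outlined region may have to receive many live-in
  // values as call arguments, making the call sequence alone bigger than the
  // region it replaces; in that case partial inlining is abandoned.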
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter ORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(Cloner.ClonedFunc);
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  assert(Cloner.OrigFunc->user_begin() == Cloner.OrigFunc->user_end() &&
         "F's users should all be replaced!");

  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  if (Cloner.OrigFunc->getEntryCount())
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    CallSite CS = getCallSite(User);

    if (IsLimitReached())
      continue;

    OptimizationRemarkEmitter ORE(CS.getCaller());

    if (!shouldPartialInline(CS, Cloner, WeightedRcost, ORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CS.getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
    if (!InlineFunction(CS, IFI, nullptr, true, Cloner.OutlinedFunc))
      continue;

    ORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    NumPartialInlined++;
  }

  if (AnyInline) {
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(CalleeEntryCountV);
  }

  return AnyInline;
}

bool PartialInlinerImpl::run(Module &M) {
  if (DisablePartialInlining)
    return false;

  std::vector<Function *> Worklist;
  Worklist.reserve(M.size());
  for (Function &F : M)
    if (!F.use_empty() && !F.isDeclaration())
      Worklist.push_back(&F);

  bool Changed = false;
  while (!Worklist.empty()) {
    Function *CurrFunc = Worklist.back();
    Worklist.pop_back();

    if (CurrFunc->use_empty())
      continue;

    bool Recursive = false;
    for (User *U : CurrFunc->users())
      if (Instruction *I = dyn_cast<Instruction>(U))
        if (I->getParent()->getParent() == CurrFunc) {
          Recursive = true;
          break;
        }
    if (Recursive)
      continue;

    if (Function *NewFunc = unswitchFunction(CurrFunc)) {
      Worklist.push_back(NewFunc);
      Changed = true;
    }
  }

  return Changed;
}

char PartialInlinerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)

ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}

PreservedAnalyses PartialInlinerPass::run(Module &M,
                                          ModuleAnalysisManager &AM) {
  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [&FAM](Function &F) -> AssumptionCache & {
    return FAM.getResult<AssumptionAnalysis>(F);
  };

  std::function<BlockFrequencyInfo &(Function &)> GetBFI =
      [&FAM](Function &F) -> BlockFrequencyInfo & {
    return FAM.getResult<BlockFrequencyAnalysis>(F);
  };

  std::function<TargetTransformInfo &(Function &)> GetTTI =
      [&FAM](Function &F) -> TargetTransformInfo & {
    return FAM.getResult<TargetIRAnalysis>(F);
  };

  ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);

  if (PartialInlinerImpl(&GetAssumptionCache, &GetTTI, {GetBFI}, PSI).run(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}