1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/None.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
23 #include "llvm/Analysis/BranchProbabilityInfo.h"
24 #include "llvm/Analysis/InlineCost.h"
25 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
27 #include "llvm/Analysis/ProfileSummaryInfo.h"
28 #include "llvm/Analysis/TargetLibraryInfo.h"
29 #include "llvm/Analysis/TargetTransformInfo.h"
30 #include "llvm/IR/Attributes.h"
31 #include "llvm/IR/BasicBlock.h"
32 #include "llvm/IR/CFG.h"
33 #include "llvm/IR/DebugLoc.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Dominators.h"
36 #include "llvm/IR/Function.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/IntrinsicInst.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Module.h"
43 #include "llvm/IR/Operator.h"
44 #include "llvm/IR/User.h"
45 #include "llvm/InitializePasses.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/BlockFrequency.h"
48 #include "llvm/Support/BranchProbability.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Transforms/IPO.h"
53 #include "llvm/Transforms/Utils/Cloning.h"
54 #include "llvm/Transforms/Utils/CodeExtractor.h"
55 #include "llvm/Transforms/Utils/ValueMapper.h"
56 #include <algorithm>
57 #include <cassert>
58 #include <cstdint>
59 #include <functional>
60 #include <iterator>
61 #include <memory>
62 #include <tuple>
63 #include <vector>
64 
65 using namespace llvm;
66 
67 #define DEBUG_TYPE "partial-inlining"
68 
STATISTIC(NumPartialInlined,
          "Number of callsites functions partially inlined into.");
STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
                                        "cold outlined regions were partially "
                                        "inlined into its caller(s).");
STATISTIC(NumColdRegionsFound,
           "Number of cold single entry/exit regions found.");
STATISTIC(NumColdRegionsOutlined,
           "Number of cold single entry/exit regions outlined.");

// Command line option to disable partial-inlining. The default is false:
static cl::opt<bool>
    DisablePartialInlining("disable-partial-inlining", cl::init(false),
                           cl::Hidden, cl::desc("Disable partial inlining"));
// Command line option to disable multi-region partial-inlining. The default is
// false:
static cl::opt<bool> DisableMultiRegionPartialInline(
    "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
    cl::desc("Disable multi-region partial inlining"));

// Command line option to force outlining in regions with live exit variables.
// The default is false:
static cl::opt<bool>
    ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
               cl::desc("Force outline regions with live exits"));

// Command line option to enable marking outline functions with Cold Calling
// Convention. The default is false:
static cl::opt<bool>
    MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
                       cl::desc("Mark outline function calls with ColdCC"));

// This is an option used by testing:
static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
                                      cl::init(false), cl::ZeroOrMore,
                                      cl::ReallyHidden,
                                      cl::desc("Skip Cost Analysis"));
// Used to determine if a cold region is worth outlining based on
// its inlining cost compared to the original function.  Default is set at 10%.
// ie. if the cold region reduces the inlining cost of the original function by
// at least 10%.
static cl::opt<float> MinRegionSizeRatio(
    "min-region-size-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum ratio comparing relative sizes of each "
             "outline candidate and original function"));
// Used to tune the minimum number of execution counts needed in the predecessor
// block to the cold edge. ie. confidence interval.
static cl::opt<unsigned>
    MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
                             cl::desc("Minimum block executions to consider "
                                      "its BranchProbabilityInfo valid"));
// Used to determine when an edge is considered cold. Default is set to 10%. ie.
// if the branch probability is 10% or less, then it is deemed as 'cold'.
static cl::opt<float> ColdBranchRatio(
    "cold-branch-ratio", cl::init(0.1), cl::Hidden,
    cl::desc("Minimum BranchProbability to consider a region cold."));

// Upper bound on the number of blocks (guarding entries plus the return
// block) that may remain in the inlined portion of a function.
static cl::opt<unsigned> MaxNumInlineBlocks(
    "max-num-inline-blocks", cl::init(5), cl::Hidden,
    cl::desc("Max number of blocks to be partially inlined"));

// Command line option to set the maximum number of partial inlining allowed
// for the module. The default value of -1 means no limit.
static cl::opt<int> MaxNumPartialInlining(
    "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
    cl::desc("Max number of partial inlining. The default is unlimited"));

// Used only when PGO or user annotated branch data is absent. It is
// the least value that is used to weigh the outline region. If BFI
// produces larger value, the BFI value will be used.
static cl::opt<int>
    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
                             cl::Hidden, cl::ZeroOrMore,
                             cl::desc("Relative frequency of outline region to "
                                      "the entry block"));

static cl::opt<unsigned> ExtraOutliningPenalty(
    "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
    cl::desc("A debug option to add additional penalty to the computed one."));
148 
149 namespace {
150 
// Describes a single-region ("early return") outlining candidate: a chain
// of guarding entry blocks ending in a conditional branch where one arm is
// a return block and the other dominates the region to be outlined.
struct FunctionOutliningInfo {
  FunctionOutliningInfo() = default;

  // Returns the number of blocks to be inlined including all blocks
  // in Entries and one return block.
  unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }

  // A set of blocks including the function entry that guard
  // the region to be outlined.
  SmallVector<BasicBlock *, 4> Entries;

  // The return block that is not included in the outlined region.
  BasicBlock *ReturnBlock = nullptr;

  // The dominating block of the region to be outlined.
  BasicBlock *NonReturnBlock = nullptr;

  // The set of blocks in Entries that are predecessors to ReturnBlock
  SmallVector<BasicBlock *, 4> ReturnBlockPreds;
};
171 
// Describes multi-region ("cold region") outlining candidates: a list of
// single-entry/single-exit regions found to be cold via profile data.
struct FunctionOutliningMultiRegionInfo {
  FunctionOutliningMultiRegionInfo() = default;

  // Container for outline regions
  struct OutlineRegionInfo {
    OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
                      BasicBlock *EntryBlock, BasicBlock *ExitBlock,
                      BasicBlock *ReturnBlock)
        : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
          ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
    // Blocks making up the region; EntryBlock is the first element.
    SmallVector<BasicBlock *, 8> Region;
    // Single entry into the region; dominates every block in Region.
    BasicBlock *EntryBlock;
    // The block inside the region that has the (unique) edge leaving it.
    BasicBlock *ExitBlock;
    // The block following the region, i.e. ExitBlock's single successor.
    BasicBlock *ReturnBlock;
  };

  // All cold regions discovered for one function.
  SmallVector<OutlineRegionInfo, 4> ORI;
};
190 
// Implements the partial-inlining transformation: outline cold or
// early-return regions of functions into new functions, then inline the
// remaining (hot) portion into callers when deemed profitable.
struct PartialInlinerImpl {

  PartialInlinerImpl(
      function_ref<AssumptionCache &(Function &)> GetAC,
      function_ref<AssumptionCache *(Function &)> LookupAC,
      function_ref<TargetTransformInfo &(Function &)> GTTI,
      function_ref<const TargetLibraryInfo &(Function &)> GTLI,
      ProfileSummaryInfo &ProfSI,
      function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
      : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
        GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}

  // Entry point: attempts partial inlining across module M; returns true if
  // the module was modified.
  bool run(Module &M);
  // Main part of the transformation that calls helper functions to find
  // outlining candidates, clone & outline the function, and attempt to
  // partially inline the resulting function. Returns true if
  // inlining was successful, false otherwise.  Also returns the outline
  // function (only if we partially inlined early returns) as there is a
  // possibility to further "peel" early return statements that were left in the
  // outline function due to code size.
  std::pair<bool, Function *> unswitchFunction(Function &F);

  // This class speculatively clones the function to be partial inlined.
  // At the end of partial inlining, the remaining callsites to the cloned
  // function that are not partially inlined will be fixed up to reference
  // the original function, and the cloned function will be erased.
  struct FunctionCloner {
    // Two constructors, one for single region outlining, the other for
    // multi-region outlining.
    FunctionCloner(Function *F, FunctionOutliningInfo *OI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);
    FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
                   OptimizationRemarkEmitter &ORE,
                   function_ref<AssumptionCache *(Function &)> LookupAC,
                   function_ref<TargetTransformInfo &(Function &)> GetTTI);

    ~FunctionCloner();

    // Prepare for function outlining: making sure there is only
    // one incoming edge from the extracted/outlined region to
    // the return block.
    void normalizeReturnBlock() const;

    // Do function outlining for cold regions.
    bool doMultiRegionFunctionOutlining();
    // Do function outlining for region after early return block(s).
    // NOTE: For vararg functions that do the vararg handling in the outlined
    //       function, we temporarily generate IR that does not properly
    //       forward varargs to the outlined function. Calling InlineFunction
    //       will update calls to the outlined functions to properly forward
    //       the varargs.
    Function *doSingleRegionFunctionOutlining();

    // The function being partially inlined.
    Function *OrigFunc = nullptr;
    // The speculative clone of OrigFunc that outlining is performed on.
    Function *ClonedFunc = nullptr;

    typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
    // Keep track of Outlined Functions and the basic block they're called from.
    SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;

    // ClonedFunc is inlined in one of its callers after function
    // outlining.
    bool IsFunctionInlined = false;
    // The cost of the region to be outlined.
    InstructionCost OutlinedRegionCost = 0;
    // ClonedOI is specific to outlining non-early return blocks.
    std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
    // ClonedOMRI is specific to outlining cold regions.
    std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
    // Block frequencies for ClonedFunc (computed before regions are
    // outlined -- see the FIXME in getOutliningCallBBRelativeFreq).
    std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
    OptimizationRemarkEmitter &ORE;
    function_ref<AssumptionCache *(Function &)> LookupAC;
    function_ref<TargetTransformInfo &(Function &)> GetTTI;
  };

private:
  // Number of partial-inline transformations performed so far; compared
  // against MaxNumPartialInlining in isLimitReached().
  int NumPartialInlining = 0;
  // Callbacks for fetching per-function analyses from the pass manager.
  function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
  function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
  function_ref<TargetTransformInfo &(Function &)> GetTTI;
  function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
  function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
  ProfileSummaryInfo &PSI;

  // Return the frequency of the OutlininingBB relative to F's entry point.
  // The result is no larger than 1 and is represented using BP.
  // (Note that the outlined region's 'head' block can only have incoming
  // edges from the guarding entry blocks).
  BranchProbability
  getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;

  // Return true if the callee of CB should be partially inlined with
  // profit.
  bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
                           BlockFrequency WeightedOutliningRcost,
                           OptimizationRemarkEmitter &ORE) const;

  // Try to inline DuplicateFunction (cloned from F with call to
  // the OutlinedFunction into its callers. Return true
  // if there is any successful inlining.
  bool tryPartialInline(FunctionCloner &Cloner);

  // Compute the mapping from use site of DuplicationFunction to the enclosing
  // BB's profile count.
  void
  computeCallsiteToProfCountMap(Function *DuplicateFunction,
                                DenseMap<User *, uint64_t> &SiteCountMap) const;

  // True once the per-module budget (-max-partial-inlining) is exhausted;
  // -1 means unlimited.
  bool isLimitReached() const {
    return (MaxNumPartialInlining != -1 &&
            NumPartialInlining >= MaxNumPartialInlining);
  }

  // All users of the cloned/outlined functions are expected to be direct
  // calls or invokes; anything else is a programming error.
  static CallBase *getSupportedCallBase(User *U) {
    if (isa<CallInst>(U) || isa<InvokeInst>(U))
      return cast<CallBase>(U);
    llvm_unreachable("All uses must be calls");
    return nullptr;
  }

  // Returns an arbitrary (the first) call site to F. Callers must ensure F
  // has at least one user, since user_begin() is dereferenced unchecked.
  static CallBase *getOneCallSiteTo(Function &F) {
    User *User = *F.user_begin();
    return getSupportedCallBase(User);
  }

  // Returns the debug location and enclosing block of one call site to F;
  // used when emitting optimization remarks.
  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
    CallBase *CB = getOneCallSiteTo(F);
    DebugLoc DLoc = CB->getDebugLoc();
    BasicBlock *Block = CB->getParent();
    return std::make_tuple(DLoc, Block);
  }

  // Returns the costs associated with function outlining:
  // - The first value is the non-weighted runtime cost for making the call
  //   to the outlined function, including the additional setup cost in the
  //   outlined function itself;
  // - The second value is the estimated size of the new call sequence in
  //   basic block Cloner.OutliningCallBB;
  std::tuple<InstructionCost, InstructionCost>
  computeOutliningCosts(FunctionCloner &Cloner) const;

  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
  // approximate both the size and runtime cost (Note that in the current
  // inline cost analysis, there is no clear distinction there either).
  static InstructionCost computeBBInlineCost(BasicBlock *BB,
                                             TargetTransformInfo *TTI);

  // Detect the single-region (early return) outlining candidate in F;
  // returns an empty pointer when F does not match the expected shape.
  std::unique_ptr<FunctionOutliningInfo>
  computeOutliningInfo(Function &F) const;

  // Detect cold single-entry/single-exit regions in F from profile data;
  // returns an empty pointer when no profitable cold region is found.
  std::unique_ptr<FunctionOutliningMultiRegionInfo>
  computeOutliningColdRegionsInfo(Function &F,
                                  OptimizationRemarkEmitter &ORE) const;
};
347 
348 struct PartialInlinerLegacyPass : public ModulePass {
349   static char ID; // Pass identification, replacement for typeid
350 
351   PartialInlinerLegacyPass() : ModulePass(ID) {
352     initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
353   }
354 
355   void getAnalysisUsage(AnalysisUsage &AU) const override {
356     AU.addRequired<AssumptionCacheTracker>();
357     AU.addRequired<ProfileSummaryInfoWrapperPass>();
358     AU.addRequired<TargetTransformInfoWrapperPass>();
359     AU.addRequired<TargetLibraryInfoWrapperPass>();
360   }
361 
362   bool runOnModule(Module &M) override {
363     if (skipModule(M))
364       return false;
365 
366     AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
367     TargetTransformInfoWrapperPass *TTIWP =
368         &getAnalysis<TargetTransformInfoWrapperPass>();
369     ProfileSummaryInfo &PSI =
370         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
371 
372     auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
373       return ACT->getAssumptionCache(F);
374     };
375 
376     auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
377       return ACT->lookupAssumptionCache(F);
378     };
379 
380     auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
381       return TTIWP->getTTI(F);
382     };
383 
384     auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
385       return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
386     };
387 
388     return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
389                               GetTLI, PSI)
390         .run(M);
391   }
392 };
393 
394 } // end anonymous namespace
395 
396 std::unique_ptr<FunctionOutliningMultiRegionInfo>
397 PartialInlinerImpl::computeOutliningColdRegionsInfo(
398     Function &F, OptimizationRemarkEmitter &ORE) const {
399   BasicBlock *EntryBlock = &F.front();
400 
401   DominatorTree DT(F);
402   LoopInfo LI(DT);
403   BranchProbabilityInfo BPI(F, LI);
404   std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
405   BlockFrequencyInfo *BFI;
406   if (!GetBFI) {
407     ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
408     BFI = ScopedBFI.get();
409   } else
410     BFI = &(GetBFI(F));
411 
412   // Return if we don't have profiling information.
413   if (!PSI.hasInstrumentationProfile())
414     return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
415 
416   std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
417       std::make_unique<FunctionOutliningMultiRegionInfo>();
418 
419   auto IsSingleExit =
420       [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
421     BasicBlock *ExitBlock = nullptr;
422     for (auto *Block : BlockList) {
423       for (BasicBlock *Succ : successors(Block)) {
424         if (!is_contained(BlockList, Succ)) {
425           if (ExitBlock) {
426             ORE.emit([&]() {
427               return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
428                                               &Succ->front())
429                      << "Region dominated by "
430                      << ore::NV("Block", BlockList.front()->getName())
431                      << " has more than one region exit edge.";
432             });
433             return nullptr;
434           }
435 
436           ExitBlock = Block;
437         }
438       }
439     }
440     return ExitBlock;
441   };
442 
443   auto BBProfileCount = [BFI](BasicBlock *BB) {
444     return BFI->getBlockProfileCount(BB).getValueOr(0);
445   };
446 
447   // Use the same computeBBInlineCost function to compute the cost savings of
448   // the outlining the candidate region.
449   TargetTransformInfo *FTTI = &GetTTI(F);
450   InstructionCost OverallFunctionCost = 0;
451   for (auto &BB : F)
452     OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
453 
454   LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
455                     << "\n";);
456 
457   InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
458       [&](auto Cost) { return Cost * MinRegionSizeRatio; });
459 
460   BranchProbability MinBranchProbability(
461       static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
462       MinBlockCounterExecution);
463   bool ColdCandidateFound = false;
464   BasicBlock *CurrEntry = EntryBlock;
465   std::vector<BasicBlock *> DFS;
466   DenseMap<BasicBlock *, bool> VisitedMap;
467   DFS.push_back(CurrEntry);
468   VisitedMap[CurrEntry] = true;
469 
470   // Use Depth First Search on the basic blocks to find CFG edges that are
471   // considered cold.
472   // Cold regions considered must also have its inline cost compared to the
473   // overall inline cost of the original function.  The region is outlined only
474   // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
475   // more.
476   while (!DFS.empty()) {
477     auto *ThisBB = DFS.back();
478     DFS.pop_back();
479     // Only consider regions with predecessor blocks that are considered
480     // not-cold (default: part of the top 99.99% of all block counters)
481     // AND greater than our minimum block execution count (default: 100).
482     if (PSI.isColdBlock(ThisBB, BFI) ||
483         BBProfileCount(ThisBB) < MinBlockCounterExecution)
484       continue;
485     for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
486       if (VisitedMap[*SI])
487         continue;
488       VisitedMap[*SI] = true;
489       DFS.push_back(*SI);
490       // If branch isn't cold, we skip to the next one.
491       BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
492       if (SuccProb > MinBranchProbability)
493         continue;
494 
495       LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
496                         << SI->getName()
497                         << "\nBranch Probability = " << SuccProb << "\n";);
498 
499       SmallVector<BasicBlock *, 8> DominateVector;
500       DT.getDescendants(*SI, DominateVector);
501       assert(!DominateVector.empty() &&
502              "SI should be reachable and have at least itself as descendant");
503 
504       // We can only outline single entry regions (for now).
505       if (!DominateVector.front()->hasNPredecessors(1)) {
506         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
507                           << " doesn't have a single predecessor in the "
508                              "dominator tree\n";);
509         continue;
510       }
511 
512       BasicBlock *ExitBlock = nullptr;
513       // We can only outline single exit regions (for now).
514       if (!(ExitBlock = IsSingleExit(DominateVector))) {
515         LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
516                           << " doesn't have a unique successor\n";);
517         continue;
518       }
519 
520       InstructionCost OutlineRegionCost = 0;
521       for (auto *BB : DominateVector)
522         OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
523 
524       LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
525                         << "\n";);
526 
527       if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
528         ORE.emit([&]() {
529           return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
530                                             &SI->front())
531                  << ore::NV("Callee", &F)
532                  << " inline cost-savings smaller than "
533                  << ore::NV("Cost", MinOutlineRegionCost);
534         });
535 
536         LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
537                           << MinOutlineRegionCost << "\n";);
538         continue;
539       }
540 
541       // For now, ignore blocks that belong to a SISE region that is a
542       // candidate for outlining.  In the future, we may want to look
543       // at inner regions because the outer region may have live-exit
544       // variables.
545       for (auto *BB : DominateVector)
546         VisitedMap[BB] = true;
547 
548       // ReturnBlock here means the block after the outline call
549       BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
550       FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
551           DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
552       OutliningInfo->ORI.push_back(RegInfo);
553       LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
554                         << DominateVector.front()->getName() << "\n";);
555       ColdCandidateFound = true;
556       NumColdRegionsFound++;
557     }
558   }
559 
560   if (ColdCandidateFound)
561     return OutliningInfo;
562 
563   return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
564 }
565 
// Detect the "early return" outlining shape: starting at the entry block,
// follow a chain of conditional branches where, at the end of the chain,
// one successor is a return block and the other dominates the region to be
// outlined. Returns an empty pointer when F does not match.
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
  BasicBlock *EntryBlock = &F.front();
  BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
  // Only functions whose entry ends in a conditional branch qualify.
  if (!BR || BR->isUnconditional())
    return std::unique_ptr<FunctionOutliningInfo>();

  // Returns true if Succ is BB's successor
  auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
    return is_contained(successors(BB), Succ);
  };

  // Returns true if BB ends in a return instruction.
  auto IsReturnBlock = [](BasicBlock *BB) {
    Instruction *TI = BB->getTerminator();
    return isa<ReturnInst>(TI);
  };

  // Orders the pair so the return block (if any) comes first; yields
  // {nullptr, nullptr} when neither successor is a return block.
  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsReturnBlock(Succ1))
      return std::make_tuple(Succ1, Succ2);
    if (IsReturnBlock(Succ2))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  // Detect a triangular shape: one successor is also a successor of the
  // other. Yields {common successor, other successor}, or {nullptr, nullptr}.
  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
    if (IsSuccessor(Succ1, Succ2))
      return std::make_tuple(Succ1, Succ2);
    if (IsSuccessor(Succ2, Succ1))
      return std::make_tuple(Succ2, Succ1);

    return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
  };

  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
      std::make_unique<FunctionOutliningInfo>();

  // Walk down from the entry, collecting guarding entry blocks until a
  // conditional branch with a direct return successor is found.
  BasicBlock *CurrEntry = EntryBlock;
  bool CandidateFound = false;
  do {
    // The number of blocks to be inlined has already reached
    // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
    // disables partial inlining for the function.
    if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
      break;

    if (succ_size(CurrEntry) != 2)
      break;

    BasicBlock *Succ1 = *succ_begin(CurrEntry);
    BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);

    if (ReturnBlock) {
      // Found the end of the chain: record the shape and stop searching.
      OutliningInfo->Entries.push_back(CurrEntry);
      OutliningInfo->ReturnBlock = ReturnBlock;
      OutliningInfo->NonReturnBlock = NonReturnBlock;
      CandidateFound = true;
      break;
    }

    BasicBlock *CommSucc, *OtherSucc;
    std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);

    if (!CommSucc)
      break;

    // Triangle: continue the chain through the non-common successor.
    OutliningInfo->Entries.push_back(CurrEntry);
    CurrEntry = OtherSucc;
  } while (true);

  if (!CandidateFound)
    return std::unique_ptr<FunctionOutliningInfo>();

  // There should not be any successors (not in the entry set) other than
  // {ReturnBlock, NonReturnBlock}
  assert(OutliningInfo->Entries[0] == &F.front() &&
         "Function Entry must be the first in Entries vector");
  DenseSet<BasicBlock *> Entries;
  for (BasicBlock *E : OutliningInfo->Entries)
    Entries.insert(E);

  // Returns true if BB has a Predecessor which is not
  // in Entries set.
  // NOTE(review): the lambda captures Entries BY VALUE, so blocks added to
  // the local set by the peeling loop below are not visible to it; confirm
  // this is the intended peeling behavior.
  auto HasNonEntryPred = [Entries](BasicBlock *BB) {
    for (auto *Pred : predecessors(BB)) {
      if (!Entries.count(Pred))
        return true;
    }
    return false;
  };
  // Verifies the candidate is closed (no stray edges into or out of the
  // entry set) and records which entries branch directly to ReturnBlock.
  auto CheckAndNormalizeCandidate =
      [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
        for (BasicBlock *E : OutliningInfo->Entries) {
          for (auto *Succ : successors(E)) {
            if (Entries.count(Succ))
              continue;
            if (Succ == OutliningInfo->ReturnBlock)
              OutliningInfo->ReturnBlockPreds.push_back(E);
            else if (Succ != OutliningInfo->NonReturnBlock)
              return false;
          }
          // There should not be any outside incoming edges either:
          if (HasNonEntryPred(E))
            return false;
        }
        return true;
      };

  if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
    return std::unique_ptr<FunctionOutliningInfo>();

  // Now further growing the candidate's inlining region by
  // peeling off dominating blocks from the outlining region:
  while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
    BasicBlock *Cand = OutliningInfo->NonReturnBlock;
    if (succ_size(Cand) != 2)
      break;

    if (HasNonEntryPred(Cand))
      break;

    BasicBlock *Succ1 = *succ_begin(Cand);
    BasicBlock *Succ2 = *(succ_begin(Cand) + 1);

    BasicBlock *ReturnBlock, *NonReturnBlock;
    std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
    // Only peel if the candidate branches to the same return block the
    // existing entries do.
    if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
      break;

    if (NonReturnBlock->getSinglePredecessor() != Cand)
      break;

    // Now grow and update OutliningInfo:
    OutliningInfo->Entries.push_back(Cand);
    OutliningInfo->NonReturnBlock = NonReturnBlock;
    OutliningInfo->ReturnBlockPreds.push_back(Cand);
    Entries.insert(Cand);
  }

  return OutliningInfo;
}
712 
713 // Check if there is PGO data or user annotated branch data:
714 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
715   if (F.hasProfileData())
716     return true;
717   // Now check if any of the entry block has MD_prof data:
718   for (auto *E : OI.Entries) {
719     BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
720     if (!BR || BR->isUnconditional())
721       continue;
722     uint64_t T, F;
723     if (BR->extractProfMetadata(T, F))
724       return true;
725   }
726   return false;
727 }
728 
729 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
730     FunctionCloner &Cloner) const {
731   BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
732   auto EntryFreq =
733       Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
734   auto OutliningCallFreq =
735       Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
736   // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
737   // we outlined any regions, so we may encounter situations where the
738   // OutliningCallFreq is *slightly* bigger than the EntryFreq.
739   if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
740     OutliningCallFreq = EntryFreq;
741 
742   auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
743       OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
744 
745   if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
746     return OutlineRegionRelFreq;
747 
748   // When profile data is not available, we need to be conservative in
749   // estimating the overall savings. Static branch prediction can usually
750   // guess the branch direction right (taken/non-taken), but the guessed
751   // branch probability is usually not biased enough. In case when the
752   // outlined region is predicted to be likely, its probability needs
753   // to be made higher (more biased) to not under-estimate the cost of
754   // function outlining. On the other hand, if the outlined region
755   // is predicted to be less likely, the predicted probablity is usually
756   // higher than the actual. For instance, the actual probability of the
757   // less likely target is only 5%, but the guessed probablity can be
758   // 40%. In the latter case, there is no need for further adjustement.
759   // FIXME: add an option for this.
760   if (OutlineRegionRelFreq < BranchProbability(45, 100))
761     return OutlineRegionRelFreq;
762 
763   OutlineRegionRelFreq = std::max(
764       OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
765 
766   return OutlineRegionRelFreq;
767 }
768 
// Decide whether call site \p CB to the cloned function is worth partially
// inlining. \p WeightedOutliningRcost is the runtime overhead of the outlined
// call(s) already scaled by the outline region's relative frequency. Emits an
// optimization remark explaining every decision, positive or negative.
bool PartialInlinerImpl::shouldPartialInline(
    CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
    OptimizationRemarkEmitter &ORE) const {
  using namespace ore;

  Function *Callee = CB.getCalledFunction();
  assert(Callee == Cloner.ClonedFunc);

  // With cost analysis disabled, only verify that inlining is possible at all.
  if (SkipCostAnalysis)
    return isInlineViable(*Callee).isSuccess();

  Function *Caller = CB.getCaller();
  auto &CalleeTTI = GetTTI(*Callee);
  bool RemarksEnabled =
      Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
          DEBUG_TYPE);
  InlineCost IC =
      getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
                    GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);

  // 'always': the regular inliner will inline the whole function anyway, so
  // partial inlining would only get in the way.
  if (IC.isAlways()) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
             << NV("Callee", Cloner.OrigFunc)
             << " should always be fully inlined, not partially";
    });
    return false;
  }

  // 'never': respect the decision that this callee must not be inlined.
  if (IC.isNever()) {
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller)
             << " because it should never be inlined (cost=never)";
    });
    return false;
  }

  // Cost exceeds threshold (InlineCost's boolean conversion).
  if (!IC) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " because too costly to inline (cost="
             << NV("Cost", IC.getCost()) << ", threshold="
             << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
    });
    return false;
  }
  const DataLayout &DL = Caller->getParent()->getDataLayout();

  // The savings of eliminating the call:
  int NonWeightedSavings = getCallsiteCost(CB, DL);
  BlockFrequency NormWeightedSavings(NonWeightedSavings);

  // Weighted saving is smaller than weighted cost, return false
  if (NormWeightedSavings < WeightedOutliningRcost) {
    ORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
                                        &CB)
             << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
             << NV("Caller", Caller) << " runtime overhead (overhead="
             << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
             << ", savings="
             << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
             << ")"
             << " of making the outlined call is too high";
    });

    return false;
  }

  // All checks passed: partial inlining of this call site looks profitable.
  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
           << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
           << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
           << " (threshold="
           << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
  });
  return true;
}
850 
851 // TODO: Ideally  we should share Inliner's InlineCost Analysis code.
852 // For now use a simplified version. The returned 'InlineCost' will be used
853 // to esimate the size cost as well as runtime cost of the BB.
854 InstructionCost
855 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
856                                         TargetTransformInfo *TTI) {
857   InstructionCost InlineCost = 0;
858   const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
859   for (Instruction &I : BB->instructionsWithoutDebug()) {
860     // Skip free instructions.
861     switch (I.getOpcode()) {
862     case Instruction::BitCast:
863     case Instruction::PtrToInt:
864     case Instruction::IntToPtr:
865     case Instruction::Alloca:
866     case Instruction::PHI:
867       continue;
868     case Instruction::GetElementPtr:
869       if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
870         continue;
871       break;
872     default:
873       break;
874     }
875 
876     if (I.isLifetimeStartOrEnd())
877       continue;
878 
879     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
880       Intrinsic::ID IID = II->getIntrinsicID();
881       SmallVector<Type *, 4> Tys;
882       FastMathFlags FMF;
883       for (Value *Val : II->args())
884         Tys.push_back(Val->getType());
885 
886       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
887         FMF = FPMO->getFastMathFlags();
888 
889       IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
890       InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
891       continue;
892     }
893 
894     if (CallInst *CI = dyn_cast<CallInst>(&I)) {
895       InlineCost += getCallsiteCost(*CI, DL);
896       continue;
897     }
898 
899     if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
900       InlineCost += getCallsiteCost(*II, DL);
901       continue;
902     }
903 
904     if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
905       InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
906       continue;
907     }
908     InlineCost += InlineConstants::InstrCost;
909   }
910 
911   return InlineCost;
912 }
913 
914 std::tuple<InstructionCost, InstructionCost>
915 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
916   InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
917   for (auto FuncBBPair : Cloner.OutlinedFunctions) {
918     Function *OutlinedFunc = FuncBBPair.first;
919     BasicBlock* OutliningCallBB = FuncBBPair.second;
920     // Now compute the cost of the call sequence to the outlined function
921     // 'OutlinedFunction' in BB 'OutliningCallBB':
922     auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
923     OutliningFuncCallCost +=
924         computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
925 
926     // Now compute the cost of the extracted/outlined function itself:
927     for (BasicBlock &BB : *OutlinedFunc)
928       OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
929   }
930   assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
931          "Outlined function cost should be no less than the outlined region");
932 
933   // The code extractor introduces a new root and exit stub blocks with
934   // additional unconditional branches. Those branches will be eliminated
935   // later with bb layout. The cost should be adjusted accordingly:
936   OutlinedFunctionCost -=
937       2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
938 
939   InstructionCost OutliningRuntimeOverhead =
940       OutliningFuncCallCost +
941       (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
942       ExtraOutliningPenalty.getValue();
943 
944   return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
945 }
946 
947 // Create the callsite to profile count map which is
948 // used to update the original function's entry count,
949 // after the function is partially inlined into the callsite.
950 void PartialInlinerImpl::computeCallsiteToProfCountMap(
951     Function *DuplicateFunction,
952     DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
953   std::vector<User *> Users(DuplicateFunction->user_begin(),
954                             DuplicateFunction->user_end());
955   Function *CurrentCaller = nullptr;
956   std::unique_ptr<BlockFrequencyInfo> TempBFI;
957   BlockFrequencyInfo *CurrentCallerBFI = nullptr;
958 
959   auto ComputeCurrBFI = [&,this](Function *Caller) {
960       // For the old pass manager:
961       if (!GetBFI) {
962         DominatorTree DT(*Caller);
963         LoopInfo LI(DT);
964         BranchProbabilityInfo BPI(*Caller, LI);
965         TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
966         CurrentCallerBFI = TempBFI.get();
967       } else {
968         // New pass manager:
969         CurrentCallerBFI = &(GetBFI(*Caller));
970       }
971   };
972 
973   for (User *User : Users) {
974     // Don't bother with BlockAddress used by CallBr for asm goto.
975     if (isa<BlockAddress>(User))
976       continue;
977     CallBase *CB = getSupportedCallBase(User);
978     Function *Caller = CB->getCaller();
979     if (CurrentCaller != Caller) {
980       CurrentCaller = Caller;
981       ComputeCurrBFI(Caller);
982     } else {
983       assert(CurrentCallerBFI && "CallerBFI is not set");
984     }
985     BasicBlock *CallBB = CB->getParent();
986     auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
987     if (Count)
988       CallSiteToProfCountMap[User] = *Count;
989     else
990       CallSiteToProfCountMap[User] = 0;
991   }
992 }
993 
994 PartialInlinerImpl::FunctionCloner::FunctionCloner(
995     Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
996     function_ref<AssumptionCache *(Function &)> LookupAC,
997     function_ref<TargetTransformInfo &(Function &)> GetTTI)
998     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
999   ClonedOI = std::make_unique<FunctionOutliningInfo>();
1000 
1001   // Clone the function, so that we can hack away on it.
1002   ValueToValueMapTy VMap;
1003   ClonedFunc = CloneFunction(F, VMap);
1004 
1005   ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1006   ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1007   for (BasicBlock *BB : OI->Entries)
1008     ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1009 
1010   for (BasicBlock *E : OI->ReturnBlockPreds) {
1011     BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1012     ClonedOI->ReturnBlockPreds.push_back(NewE);
1013   }
1014   // Go ahead and update all uses to the duplicate, so that we can just
1015   // use the inliner functionality when we're done hacking.
1016   F->replaceAllUsesWith(ClonedFunc);
1017 }
1018 
1019 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1020     Function *F, FunctionOutliningMultiRegionInfo *OI,
1021     OptimizationRemarkEmitter &ORE,
1022     function_ref<AssumptionCache *(Function &)> LookupAC,
1023     function_ref<TargetTransformInfo &(Function &)> GetTTI)
1024     : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1025   ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1026 
1027   // Clone the function, so that we can hack away on it.
1028   ValueToValueMapTy VMap;
1029   ClonedFunc = CloneFunction(F, VMap);
1030 
1031   // Go through all Outline Candidate Regions and update all BasicBlock
1032   // information.
1033   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1034        OI->ORI) {
1035     SmallVector<BasicBlock *, 8> Region;
1036     for (BasicBlock *BB : RegionInfo.Region)
1037       Region.push_back(cast<BasicBlock>(VMap[BB]));
1038 
1039     BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1040     BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1041     BasicBlock *NewReturnBlock = nullptr;
1042     if (RegionInfo.ReturnBlock)
1043       NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1044     FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1045         Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1046     ClonedOMRI->ORI.push_back(MappedRegionInfo);
1047   }
1048   // Go ahead and update all uses to the duplicate, so that we can just
1049   // use the inliner functionality when we're done hacking.
1050   F->replaceAllUsesWith(ClonedFunc);
1051 }
1052 
1053 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1054   auto GetFirstPHI = [](BasicBlock *BB) {
1055     BasicBlock::iterator I = BB->begin();
1056     PHINode *FirstPhi = nullptr;
1057     while (I != BB->end()) {
1058       PHINode *Phi = dyn_cast<PHINode>(I);
1059       if (!Phi)
1060         break;
1061       if (!FirstPhi) {
1062         FirstPhi = Phi;
1063         break;
1064       }
1065     }
1066     return FirstPhi;
1067   };
1068 
1069   // Shouldn't need to normalize PHIs if we're not outlining non-early return
1070   // blocks.
1071   if (!ClonedOI)
1072     return;
1073 
1074   // Special hackery is needed with PHI nodes that have inputs from more than
1075   // one extracted block.  For simplicity, just split the PHIs into a two-level
1076   // sequence of PHIs, some of which will go in the extracted region, and some
1077   // of which will go outside.
1078   BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1079   // only split block when necessary:
1080   PHINode *FirstPhi = GetFirstPHI(PreReturn);
1081   unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1082 
1083   if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1084     return;
1085 
1086   auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1087     Value *CommonValue = PN->getIncomingValue(0);
1088     if (all_of(PN->incoming_values(),
1089                [&](Value *V) { return V == CommonValue; }))
1090       return CommonValue;
1091     return nullptr;
1092   };
1093 
1094   ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1095       ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1096   BasicBlock::iterator I = PreReturn->begin();
1097   Instruction *Ins = &ClonedOI->ReturnBlock->front();
1098   SmallVector<Instruction *, 4> DeadPhis;
1099   while (I != PreReturn->end()) {
1100     PHINode *OldPhi = dyn_cast<PHINode>(I);
1101     if (!OldPhi)
1102       break;
1103 
1104     PHINode *RetPhi =
1105         PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1106     OldPhi->replaceAllUsesWith(RetPhi);
1107     Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1108 
1109     RetPhi->addIncoming(&*I, PreReturn);
1110     for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1111       RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1112       OldPhi->removeIncomingValue(E);
1113     }
1114 
1115     // After incoming values splitting, the old phi may become trivial.
1116     // Keeping the trivial phi can introduce definition inside the outline
1117     // region which is live-out, causing necessary overhead (load, store
1118     // arg passing etc).
1119     if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1120       OldPhi->replaceAllUsesWith(OldPhiVal);
1121       DeadPhis.push_back(OldPhi);
1122     }
1123     ++I;
1124   }
1125   for (auto *DP : DeadPhis)
1126     DP->eraseFromParent();
1127 
1128   for (auto *E : ClonedOI->ReturnBlockPreds)
1129     E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1130 }
1131 
1132 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1133 
1134   auto ComputeRegionCost =
1135       [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1136     InstructionCost Cost = 0;
1137     for (BasicBlock* BB : Region)
1138       Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1139     return Cost;
1140   };
1141 
1142   assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1143 
1144   if (ClonedOMRI->ORI.empty())
1145     return false;
1146 
1147   // The CodeExtractor needs a dominator tree.
1148   DominatorTree DT;
1149   DT.recalculate(*ClonedFunc);
1150 
1151   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1152   LoopInfo LI(DT);
1153   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1154   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1155 
1156   // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1157   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1158 
1159   SetVector<Value *> Inputs, Outputs, Sinks;
1160   for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1161        ClonedOMRI->ORI) {
1162     InstructionCost CurrentOutlinedRegionCost =
1163         ComputeRegionCost(RegionInfo.Region);
1164 
1165     CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1166                      ClonedFuncBFI.get(), &BPI,
1167                      LookupAC(*RegionInfo.EntryBlock->getParent()),
1168                      /* AllowVarargs */ false);
1169 
1170     CE.findInputsOutputs(Inputs, Outputs, Sinks);
1171 
1172     LLVM_DEBUG({
1173       dbgs() << "inputs: " << Inputs.size() << "\n";
1174       dbgs() << "outputs: " << Outputs.size() << "\n";
1175       for (Value *value : Inputs)
1176         dbgs() << "value used in func: " << *value << "\n";
1177       for (Value *output : Outputs)
1178         dbgs() << "instr used in func: " << *output << "\n";
1179     });
1180 
1181     // Do not extract regions that have live exit variables.
1182     if (Outputs.size() > 0 && !ForceLiveExit)
1183       continue;
1184 
1185     if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1186       CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1187       BasicBlock *OutliningCallBB = OCS->getParent();
1188       assert(OutliningCallBB->getParent() == ClonedFunc);
1189       OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1190       NumColdRegionsOutlined++;
1191       OutlinedRegionCost += CurrentOutlinedRegionCost;
1192 
1193       if (MarkOutlinedColdCC) {
1194         OutlinedFunc->setCallingConv(CallingConv::Cold);
1195         OCS->setCallingConv(CallingConv::Cold);
1196       }
1197     } else
1198       ORE.emit([&]() {
1199         return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1200                                         &RegionInfo.Region.front()->front())
1201                << "Failed to extract region at block "
1202                << ore::NV("Block", RegionInfo.Region.front());
1203       });
1204   }
1205 
1206   return !OutlinedFunctions.empty();
1207 }
1208 
1209 Function *
1210 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1211   // Returns true if the block is to be partial inlined into the caller
1212   // (i.e. not to be extracted to the out of line function)
1213   auto ToBeInlined = [&, this](BasicBlock *BB) {
1214     return BB == ClonedOI->ReturnBlock ||
1215            llvm::is_contained(ClonedOI->Entries, BB);
1216   };
1217 
1218   assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1219   // The CodeExtractor needs a dominator tree.
1220   DominatorTree DT;
1221   DT.recalculate(*ClonedFunc);
1222 
1223   // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1224   LoopInfo LI(DT);
1225   BranchProbabilityInfo BPI(*ClonedFunc, LI);
1226   ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1227 
1228   // Gather up the blocks that we're going to extract.
1229   std::vector<BasicBlock *> ToExtract;
1230   auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1231   ToExtract.push_back(ClonedOI->NonReturnBlock);
1232   OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1233       ClonedOI->NonReturnBlock, ClonedFuncTTI);
1234   for (BasicBlock &BB : *ClonedFunc)
1235     if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1236       ToExtract.push_back(&BB);
1237       // FIXME: the code extractor may hoist/sink more code
1238       // into the outlined function which may make the outlining
1239       // overhead (the difference of the outlined function cost
1240       // and OutliningRegionCost) look larger.
1241       OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1242     }
1243 
1244   // Extract the body of the if.
1245   CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1246   Function *OutlinedFunc =
1247       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1248                     ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1249                     /* AllowVarargs */ true)
1250           .extractCodeRegion(CEAC);
1251 
1252   if (OutlinedFunc) {
1253     BasicBlock *OutliningCallBB =
1254         PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1255     assert(OutliningCallBB->getParent() == ClonedFunc);
1256     OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1257   } else
1258     ORE.emit([&]() {
1259       return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1260                                       &ToExtract.front()->front())
1261              << "Failed to extract region at block "
1262              << ore::NV("Block", ToExtract.front());
1263     });
1264 
1265   return OutlinedFunc;
1266 }
1267 
1268 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1269   // Ditch the duplicate, since we're done with it, and rewrite all remaining
1270   // users (function pointers, etc.) back to the original function.
1271   ClonedFunc->replaceAllUsesWith(OrigFunc);
1272   ClonedFunc->eraseFromParent();
1273   if (!IsFunctionInlined) {
1274     // Remove each function that was speculatively created if there is no
1275     // reference.
1276     for (auto FuncBBPair : OutlinedFunctions) {
1277       Function *Func = FuncBBPair.first;
1278       Func->eraseFromParent();
1279     }
1280   }
1281 }
1282 
// Attempt partial inlining of \p F. Returns a pair {Changed, OutlinedFunc}:
// Changed is true if any call site was partially inlined; OutlinedFunc is the
// newly created outlined function from the single-region path (so the caller
// can reconsider it), or nullptr otherwise.
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
  // Address-taken functions may be compared/stored; rewriting their call
  // sites is not safe.
  if (F.hasAddressTaken())
    return {false, nullptr};

  // Let inliner handle it
  if (F.hasFnAttribute(Attribute::AlwaysInline))
    return {false, nullptr};

  if (F.hasFnAttribute(Attribute::NoInline))
    return {false, nullptr};

  if (PSI.isFunctionEntryCold(&F))
    return {false, nullptr};

  if (F.users().empty())
    return {false, nullptr};

  OptimizationRemarkEmitter ORE(&F);

  // Only try to outline cold regions if we have a profile summary, which
  // implies we have profiling information.
  if (PSI.hasProfileSummary() && F.hasProfileData() &&
      !DisableMultiRegionPartialInline) {
    std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
        computeOutliningColdRegionsInfo(F, ORE);
    if (OMRI) {
      // NOTE: constructing the cloner clones F and redirects all of F's uses
      // to the clone; the cloner's destructor undoes this for uses that
      // remain.
      FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);

      LLVM_DEBUG({
        dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
        dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
               << "\n";
      });

      bool DidOutline = Cloner.doMultiRegionFunctionOutlining();

      if (DidOutline) {
        LLVM_DEBUG({
          dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
          Cloner.ClonedFunc->print(dbgs());
          dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
        });

        if (tryPartialInline(Cloner))
          return {true, nullptr};
      }
    }
  }

  // Fall-thru to regular partial inlining if we:
  //    i) can't find any cold regions to outline, or
  //   ii) can't inline the outlined function anywhere.
  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
  if (!OI)
    return {false, nullptr};

  FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
  Cloner.normalizeReturnBlock();

  Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();

  if (!OutlinedFunction)
    return {false, nullptr};

  if (tryPartialInline(Cloner))
    return {true, OutlinedFunction};

  return {false, nullptr};
}
1352 
// Try to inline the residual (non-outlined) part of Cloner.ClonedFunc into
// every supported call site of the original function, updating profile entry
// counts as call sites are inlined. Returns true if at least one call site
// was partially inlined (which also marks the cloner so the speculatively
// outlined functions are kept).
bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
  if (Cloner.OutlinedFunctions.empty())
    return false;

  // SizeCost: size of the call sequence(s) to the outlined function(s).
  // NonWeightedRcost: runtime overhead of making the outlined call(s).
  int SizeCost = 0;
  BlockFrequency WeightedRcost;
  int NonWeightedRcost;

  auto OutliningCosts = computeOutliningCosts(Cloner);
  assert(std::get<0>(OutliningCosts).isValid() &&
         std::get<1>(OutliningCosts).isValid() && "Expected valid costs");

  SizeCost = *std::get<0>(OutliningCosts).getValue();
  NonWeightedRcost = *std::get<1>(OutliningCosts).getValue();

  // Only calculate RelativeToEntryFreq when we are doing single region
  // outlining.
  BranchProbability RelativeToEntryFreq;
  if (Cloner.ClonedOI)
    RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
  else
    // RelativeToEntryFreq doesn't make sense when we have more than one
    // outlined call because each call will have a different relative frequency
    // to the entry block.  We can consider using the average, but the
    // usefulness of that information is questionable. For now, assume we never
    // execute the calls to outlined functions.
    RelativeToEntryFreq = BranchProbability(0, 1);

  WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;

  // The call sequence(s) to the outlined function(s) are larger than the sum of
  // the original outlined region size(s), it does not increase the chances of
  // inlining the function with outlining (The inliner uses the size increase to
  // model the cost of inlining a callee).
  if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    DebugLoc DLoc;
    BasicBlock *Block;
    std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
                                        DLoc, Block)
             << ore::NV("Function", Cloner.OrigFunc)
             << " not partially inlined into callers (Original Size = "
             << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
             << ", Size of call sequence to outlined function = "
             << ore::NV("NewSize", SizeCost) << ")";
    });
    return false;
  }

  // The cloner's constructor redirected all of OrigFunc's uses to ClonedFunc.
  assert(Cloner.OrigFunc->users().empty() &&
         "F's users should all be replaced!");

  // Snapshot the users before inlining mutates the use list.
  std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
                            Cloner.ClonedFunc->user_end());

  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
  auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
  if (CalleeEntryCount)
    computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);

  uint64_t CalleeEntryCountV =
      (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);

  bool AnyInline = false;
  for (User *User : Users) {
    // Don't bother with BlockAddress used by CallBr for asm goto.
    if (isa<BlockAddress>(User))
      continue;

    CallBase *CB = getSupportedCallBase(User);

    if (isLimitReached())
      continue;

    OptimizationRemarkEmitter CallerORE(CB->getCaller());
    if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
      continue;

    // Construct remark before doing the inlining, as after successful inlining
    // the callsite is removed.
    OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
    OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
       << ore::NV("Caller", CB->getCaller());

    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
    // We can only forward varargs when we outlined a single region, else we
    // bail on vararg functions.
    if (!InlineFunction(*CB, IFI, nullptr, true,
                        (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
                                         : nullptr))
             .isSuccess())
      continue;

    CallerORE.emit(OR);

    // Now update the entry count:
    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
    }

    AnyInline = true;
    NumPartialInlining++;
    // Update the stats
    if (Cloner.ClonedOI)
      NumPartialInlined++;
    else
      NumColdOutlinePartialInlined++;
  }

  if (AnyInline) {
    // Keep the outlined functions (see ~FunctionCloner) and write back the
    // remaining entry count to the original function.
    Cloner.IsFunctionInlined = true;
    if (CalleeEntryCount)
      Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
          CalleeEntryCountV, CalleeEntryCount->getType()));
    OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
    OrigFuncORE.emit([&]() {
      return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
             << "Partially inlined into at least one caller";
    });
  }

  return AnyInline;
}
1479 
1480 bool PartialInlinerImpl::run(Module &M) {
1481   if (DisablePartialInlining)
1482     return false;
1483 
1484   std::vector<Function *> Worklist;
1485   Worklist.reserve(M.size());
1486   for (Function &F : M)
1487     if (!F.use_empty() && !F.isDeclaration())
1488       Worklist.push_back(&F);
1489 
1490   bool Changed = false;
1491   while (!Worklist.empty()) {
1492     Function *CurrFunc = Worklist.back();
1493     Worklist.pop_back();
1494 
1495     if (CurrFunc->use_empty())
1496       continue;
1497 
1498     bool Recursive = false;
1499     for (User *U : CurrFunc->users())
1500       if (Instruction *I = dyn_cast<Instruction>(U))
1501         if (I->getParent()->getParent() == CurrFunc) {
1502           Recursive = true;
1503           break;
1504         }
1505     if (Recursive)
1506       continue;
1507 
1508     std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1509     if (Result.second)
1510       Worklist.push_back(Result.second);
1511     Changed |= Result.first;
1512   }
1513 
1514   return Changed;
1515 }
1516 
// Unique address used by the legacy pass manager to identify this pass.
char PartialInlinerLegacyPass::ID = 0;

// Register the legacy pass and declare the analysis passes it depends on.
INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
                      "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
                    "Partial Inliner", false, false)
1527 
// Factory used by the legacy pass manager; the returned pass is owned by the
// pass manager it is added to.
ModulePass *llvm::createPartialInliningPass() {
  return new PartialInlinerLegacyPass();
}
1531 
1532 PreservedAnalyses PartialInlinerPass::run(Module &M,
1533                                           ModuleAnalysisManager &AM) {
1534   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1535 
1536   auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1537     return FAM.getResult<AssumptionAnalysis>(F);
1538   };
1539 
1540   auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1541     return FAM.getCachedResult<AssumptionAnalysis>(F);
1542   };
1543 
1544   auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1545     return FAM.getResult<BlockFrequencyAnalysis>(F);
1546   };
1547 
1548   auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1549     return FAM.getResult<TargetIRAnalysis>(F);
1550   };
1551 
1552   auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1553     return FAM.getResult<TargetLibraryAnalysis>(F);
1554   };
1555 
1556   ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1557 
1558   if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1559                          GetTLI, PSI, GetBFI)
1560           .run(M))
1561     return PreservedAnalyses::none();
1562   return PreservedAnalyses::all();
1563 }
1564