1 //===- PartialInlining.cpp - Inline parts of functions --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass performs partial inlining, typically by inlining an if statement
10 // that surrounds the body of the function.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/Transforms/IPO/PartialInlining.h"
15 #include "llvm/ADT/DenseMap.h"
16 #include "llvm/ADT/DenseSet.h"
17 #include "llvm/ADT/Optional.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/BlockFrequencyInfo.h"
22 #include "llvm/Analysis/BranchProbabilityInfo.h"
23 #include "llvm/Analysis/InlineCost.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/ProfileSummaryInfo.h"
27 #include "llvm/Analysis/TargetLibraryInfo.h"
28 #include "llvm/Analysis/TargetTransformInfo.h"
29 #include "llvm/IR/Attributes.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/CFG.h"
32 #include "llvm/IR/DebugLoc.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/Dominators.h"
35 #include "llvm/IR/Function.h"
36 #include "llvm/IR/InstrTypes.h"
37 #include "llvm/IR/Instruction.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Module.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/User.h"
44 #include "llvm/InitializePasses.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/BlockFrequency.h"
47 #include "llvm/Support/BranchProbability.h"
48 #include "llvm/Support/Casting.h"
49 #include "llvm/Support/CommandLine.h"
50 #include "llvm/Support/ErrorHandling.h"
51 #include "llvm/Transforms/IPO.h"
52 #include "llvm/Transforms/Utils/Cloning.h"
53 #include "llvm/Transforms/Utils/CodeExtractor.h"
54 #include "llvm/Transforms/Utils/ValueMapper.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <memory>
59 #include <tuple>
60 #include <vector>
61
62 using namespace llvm;
63
64 #define DEBUG_TYPE "partial-inlining"
65
66 STATISTIC(NumPartialInlined,
67 "Number of callsites functions partially inlined into.");
68 STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
69 "cold outlined regions were partially "
70 "inlined into its caller(s).");
71 STATISTIC(NumColdRegionsFound,
72 "Number of cold single entry/exit regions found.");
73 STATISTIC(NumColdRegionsOutlined,
74 "Number of cold single entry/exit regions outlined.");
75
76 // Command line option to disable partial-inlining. The default is false:
77 static cl::opt<bool>
78 DisablePartialInlining("disable-partial-inlining", cl::init(false),
79 cl::Hidden, cl::desc("Disable partial inlining"));
80 // Command line option to disable multi-region partial-inlining. The default is
81 // false:
82 static cl::opt<bool> DisableMultiRegionPartialInline(
83 "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
84 cl::desc("Disable multi-region partial inlining"));
85
86 // Command line option to force outlining in regions with live exit variables.
87 // The default is false:
88 static cl::opt<bool>
89 ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
90 cl::desc("Force outline regions with live exits"));
91
92 // Command line option to enable marking outline functions with Cold Calling
93 // Convention. The default is false:
94 static cl::opt<bool>
95 MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
96 cl::desc("Mark outline function calls with ColdCC"));
97
98 // This is an option used by testing:
99 static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
100
101 cl::ReallyHidden,
102 cl::desc("Skip Cost Analysis"));
103 // Used to determine if a cold region is worth outlining based on
104 // its inlining cost compared to the original function. Default is set at 10%.
105 // ie. if the cold region reduces the inlining cost of the original function by
106 // at least 10%.
107 static cl::opt<float> MinRegionSizeRatio(
108 "min-region-size-ratio", cl::init(0.1), cl::Hidden,
109 cl::desc("Minimum ratio comparing relative sizes of each "
110 "outline candidate and original function"));
111 // Used to tune the minimum number of execution counts needed in the predecessor
112 // block to the cold edge. ie. confidence interval.
113 static cl::opt<unsigned>
114 MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
115 cl::desc("Minimum block executions to consider "
116 "its BranchProbabilityInfo valid"));
117 // Used to determine when an edge is considered cold. Default is set to 10%. ie.
118 // if the branch probability is 10% or less, then it is deemed as 'cold'.
119 static cl::opt<float> ColdBranchRatio(
120 "cold-branch-ratio", cl::init(0.1), cl::Hidden,
121 cl::desc("Minimum BranchProbability to consider a region cold."));
122
123 static cl::opt<unsigned> MaxNumInlineBlocks(
124 "max-num-inline-blocks", cl::init(5), cl::Hidden,
125 cl::desc("Max number of blocks to be partially inlined"));
126
127 // Command line option to set the maximum number of partial inlining allowed
128 // for the module. The default value of -1 means no limit.
129 static cl::opt<int> MaxNumPartialInlining(
130 "max-partial-inlining", cl::init(-1), cl::Hidden,
131 cl::desc("Max number of partial inlining. The default is unlimited"));
132
133 // Used only when PGO or user annotated branch data is absent. It is
134 // the least value that is used to weigh the outline region. If BFI
135 // produces larger value, the BFI value will be used.
136 static cl::opt<int>
137 OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
138 cl::Hidden,
139 cl::desc("Relative frequency of outline region to "
140 "the entry block"));
141
142 static cl::opt<unsigned> ExtraOutliningPenalty(
143 "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
144 cl::desc("A debug option to add additional penalty to the computed one."));
145
146 namespace {
147
148 struct FunctionOutliningInfo {
149 FunctionOutliningInfo() = default;
150
151 // Returns the number of blocks to be inlined including all blocks
152 // in Entries and one return block.
getNumInlinedBlocks__anonc51db3fa0111::FunctionOutliningInfo153 unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
154
155 // A set of blocks including the function entry that guard
156 // the region to be outlined.
157 SmallVector<BasicBlock *, 4> Entries;
158
159 // The return block that is not included in the outlined region.
160 BasicBlock *ReturnBlock = nullptr;
161
162 // The dominating block of the region to be outlined.
163 BasicBlock *NonReturnBlock = nullptr;
164
165 // The set of blocks in Entries that that are predecessors to ReturnBlock
166 SmallVector<BasicBlock *, 4> ReturnBlockPreds;
167 };
168
169 struct FunctionOutliningMultiRegionInfo {
170 FunctionOutliningMultiRegionInfo() = default;
171
172 // Container for outline regions
173 struct OutlineRegionInfo {
OutlineRegionInfo__anonc51db3fa0111::FunctionOutliningMultiRegionInfo::OutlineRegionInfo174 OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
175 BasicBlock *EntryBlock, BasicBlock *ExitBlock,
176 BasicBlock *ReturnBlock)
177 : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
178 ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
179 SmallVector<BasicBlock *, 8> Region;
180 BasicBlock *EntryBlock;
181 BasicBlock *ExitBlock;
182 BasicBlock *ReturnBlock;
183 };
184
185 SmallVector<OutlineRegionInfo, 4> ORI;
186 };
187
188 struct PartialInlinerImpl {
189
PartialInlinerImpl__anonc51db3fa0111::PartialInlinerImpl190 PartialInlinerImpl(
191 function_ref<AssumptionCache &(Function &)> GetAC,
192 function_ref<AssumptionCache *(Function &)> LookupAC,
193 function_ref<TargetTransformInfo &(Function &)> GTTI,
194 function_ref<const TargetLibraryInfo &(Function &)> GTLI,
195 ProfileSummaryInfo &ProfSI,
196 function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
197 : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
198 GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
199
200 bool run(Module &M);
201 // Main part of the transformation that calls helper functions to find
202 // outlining candidates, clone & outline the function, and attempt to
203 // partially inline the resulting function. Returns true if
204 // inlining was successful, false otherwise. Also returns the outline
205 // function (only if we partially inlined early returns) as there is a
206 // possibility to further "peel" early return statements that were left in the
207 // outline function due to code size.
208 std::pair<bool, Function *> unswitchFunction(Function &F);
209
210 // This class speculatively clones the function to be partial inlined.
211 // At the end of partial inlining, the remaining callsites to the cloned
212 // function that are not partially inlined will be fixed up to reference
213 // the original function, and the cloned function will be erased.
214 struct FunctionCloner {
215 // Two constructors, one for single region outlining, the other for
216 // multi-region outlining.
217 FunctionCloner(Function *F, FunctionOutliningInfo *OI,
218 OptimizationRemarkEmitter &ORE,
219 function_ref<AssumptionCache *(Function &)> LookupAC,
220 function_ref<TargetTransformInfo &(Function &)> GetTTI);
221 FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
222 OptimizationRemarkEmitter &ORE,
223 function_ref<AssumptionCache *(Function &)> LookupAC,
224 function_ref<TargetTransformInfo &(Function &)> GetTTI);
225
226 ~FunctionCloner();
227
228 // Prepare for function outlining: making sure there is only
229 // one incoming edge from the extracted/outlined region to
230 // the return block.
231 void normalizeReturnBlock() const;
232
233 // Do function outlining for cold regions.
234 bool doMultiRegionFunctionOutlining();
235 // Do function outlining for region after early return block(s).
236 // NOTE: For vararg functions that do the vararg handling in the outlined
237 // function, we temporarily generate IR that does not properly
238 // forward varargs to the outlined function. Calling InlineFunction
239 // will update calls to the outlined functions to properly forward
240 // the varargs.
241 Function *doSingleRegionFunctionOutlining();
242
243 Function *OrigFunc = nullptr;
244 Function *ClonedFunc = nullptr;
245
246 typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
247 // Keep track of Outlined Functions and the basic block they're called from.
248 SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
249
250 // ClonedFunc is inlined in one of its callers after function
251 // outlining.
252 bool IsFunctionInlined = false;
253 // The cost of the region to be outlined.
254 InstructionCost OutlinedRegionCost = 0;
255 // ClonedOI is specific to outlining non-early return blocks.
256 std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
257 // ClonedOMRI is specific to outlining cold regions.
258 std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
259 std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
260 OptimizationRemarkEmitter &ORE;
261 function_ref<AssumptionCache *(Function &)> LookupAC;
262 function_ref<TargetTransformInfo &(Function &)> GetTTI;
263 };
264
265 private:
266 int NumPartialInlining = 0;
267 function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
268 function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
269 function_ref<TargetTransformInfo &(Function &)> GetTTI;
270 function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
271 function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
272 ProfileSummaryInfo &PSI;
273
274 // Return the frequency of the OutlininingBB relative to F's entry point.
275 // The result is no larger than 1 and is represented using BP.
276 // (Note that the outlined region's 'head' block can only have incoming
277 // edges from the guarding entry blocks).
278 BranchProbability
279 getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
280
281 // Return true if the callee of CB should be partially inlined with
282 // profit.
283 bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
284 BlockFrequency WeightedOutliningRcost,
285 OptimizationRemarkEmitter &ORE) const;
286
287 // Try to inline DuplicateFunction (cloned from F with call to
288 // the OutlinedFunction into its callers. Return true
289 // if there is any successful inlining.
290 bool tryPartialInline(FunctionCloner &Cloner);
291
292 // Compute the mapping from use site of DuplicationFunction to the enclosing
293 // BB's profile count.
294 void
295 computeCallsiteToProfCountMap(Function *DuplicateFunction,
296 DenseMap<User *, uint64_t> &SiteCountMap) const;
297
isLimitReached__anonc51db3fa0111::PartialInlinerImpl298 bool isLimitReached() const {
299 return (MaxNumPartialInlining != -1 &&
300 NumPartialInlining >= MaxNumPartialInlining);
301 }
302
getSupportedCallBase__anonc51db3fa0111::PartialInlinerImpl303 static CallBase *getSupportedCallBase(User *U) {
304 if (isa<CallInst>(U) || isa<InvokeInst>(U))
305 return cast<CallBase>(U);
306 llvm_unreachable("All uses must be calls");
307 return nullptr;
308 }
309
getOneCallSiteTo__anonc51db3fa0111::PartialInlinerImpl310 static CallBase *getOneCallSiteTo(Function &F) {
311 User *User = *F.user_begin();
312 return getSupportedCallBase(User);
313 }
314
getOneDebugLoc__anonc51db3fa0111::PartialInlinerImpl315 std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
316 CallBase *CB = getOneCallSiteTo(F);
317 DebugLoc DLoc = CB->getDebugLoc();
318 BasicBlock *Block = CB->getParent();
319 return std::make_tuple(DLoc, Block);
320 }
321
322 // Returns the costs associated with function outlining:
323 // - The first value is the non-weighted runtime cost for making the call
324 // to the outlined function, including the addtional setup cost in the
325 // outlined function itself;
326 // - The second value is the estimated size of the new call sequence in
327 // basic block Cloner.OutliningCallBB;
328 std::tuple<InstructionCost, InstructionCost>
329 computeOutliningCosts(FunctionCloner &Cloner) const;
330
331 // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
332 // approximate both the size and runtime cost (Note that in the current
333 // inline cost analysis, there is no clear distinction there either).
334 static InstructionCost computeBBInlineCost(BasicBlock *BB,
335 TargetTransformInfo *TTI);
336
337 std::unique_ptr<FunctionOutliningInfo>
338 computeOutliningInfo(Function &F) const;
339
340 std::unique_ptr<FunctionOutliningMultiRegionInfo>
341 computeOutliningColdRegionsInfo(Function &F,
342 OptimizationRemarkEmitter &ORE) const;
343 };
344
345 struct PartialInlinerLegacyPass : public ModulePass {
346 static char ID; // Pass identification, replacement for typeid
347
PartialInlinerLegacyPass__anonc51db3fa0111::PartialInlinerLegacyPass348 PartialInlinerLegacyPass() : ModulePass(ID) {
349 initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
350 }
351
getAnalysisUsage__anonc51db3fa0111::PartialInlinerLegacyPass352 void getAnalysisUsage(AnalysisUsage &AU) const override {
353 AU.addRequired<AssumptionCacheTracker>();
354 AU.addRequired<ProfileSummaryInfoWrapperPass>();
355 AU.addRequired<TargetTransformInfoWrapperPass>();
356 AU.addRequired<TargetLibraryInfoWrapperPass>();
357 }
358
runOnModule__anonc51db3fa0111::PartialInlinerLegacyPass359 bool runOnModule(Module &M) override {
360 if (skipModule(M))
361 return false;
362
363 AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
364 TargetTransformInfoWrapperPass *TTIWP =
365 &getAnalysis<TargetTransformInfoWrapperPass>();
366 ProfileSummaryInfo &PSI =
367 getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
368
369 auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
370 return ACT->getAssumptionCache(F);
371 };
372
373 auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
374 return ACT->lookupAssumptionCache(F);
375 };
376
377 auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
378 return TTIWP->getTTI(F);
379 };
380
381 auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
382 return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
383 };
384
385 return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
386 GetTLI, PSI)
387 .run(M);
388 }
389 };
390
391 } // end anonymous namespace
392
393 std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function & F,OptimizationRemarkEmitter & ORE) const394 PartialInlinerImpl::computeOutliningColdRegionsInfo(
395 Function &F, OptimizationRemarkEmitter &ORE) const {
396 BasicBlock *EntryBlock = &F.front();
397
398 DominatorTree DT(F);
399 LoopInfo LI(DT);
400 BranchProbabilityInfo BPI(F, LI);
401 std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
402 BlockFrequencyInfo *BFI;
403 if (!GetBFI) {
404 ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
405 BFI = ScopedBFI.get();
406 } else
407 BFI = &(GetBFI(F));
408
409 // Return if we don't have profiling information.
410 if (!PSI.hasInstrumentationProfile())
411 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
412
413 std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
414 std::make_unique<FunctionOutliningMultiRegionInfo>();
415
416 auto IsSingleExit =
417 [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
418 BasicBlock *ExitBlock = nullptr;
419 for (auto *Block : BlockList) {
420 for (BasicBlock *Succ : successors(Block)) {
421 if (!is_contained(BlockList, Succ)) {
422 if (ExitBlock) {
423 ORE.emit([&]() {
424 return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
425 &Succ->front())
426 << "Region dominated by "
427 << ore::NV("Block", BlockList.front()->getName())
428 << " has more than one region exit edge.";
429 });
430 return nullptr;
431 }
432
433 ExitBlock = Block;
434 }
435 }
436 }
437 return ExitBlock;
438 };
439
440 auto BBProfileCount = [BFI](BasicBlock *BB) {
441 return BFI->getBlockProfileCount(BB).value_or(0);
442 };
443
444 // Use the same computeBBInlineCost function to compute the cost savings of
445 // the outlining the candidate region.
446 TargetTransformInfo *FTTI = &GetTTI(F);
447 InstructionCost OverallFunctionCost = 0;
448 for (auto &BB : F)
449 OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
450
451 LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
452 << "\n";);
453
454 InstructionCost MinOutlineRegionCost = OverallFunctionCost.map(
455 [&](auto Cost) { return Cost * MinRegionSizeRatio; });
456
457 BranchProbability MinBranchProbability(
458 static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
459 MinBlockCounterExecution);
460 bool ColdCandidateFound = false;
461 BasicBlock *CurrEntry = EntryBlock;
462 std::vector<BasicBlock *> DFS;
463 DenseMap<BasicBlock *, bool> VisitedMap;
464 DFS.push_back(CurrEntry);
465 VisitedMap[CurrEntry] = true;
466
467 // Use Depth First Search on the basic blocks to find CFG edges that are
468 // considered cold.
469 // Cold regions considered must also have its inline cost compared to the
470 // overall inline cost of the original function. The region is outlined only
471 // if it reduced the inline cost of the function by 'MinOutlineRegionCost' or
472 // more.
473 while (!DFS.empty()) {
474 auto *ThisBB = DFS.back();
475 DFS.pop_back();
476 // Only consider regions with predecessor blocks that are considered
477 // not-cold (default: part of the top 99.99% of all block counters)
478 // AND greater than our minimum block execution count (default: 100).
479 if (PSI.isColdBlock(ThisBB, BFI) ||
480 BBProfileCount(ThisBB) < MinBlockCounterExecution)
481 continue;
482 for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
483 if (VisitedMap[*SI])
484 continue;
485 VisitedMap[*SI] = true;
486 DFS.push_back(*SI);
487 // If branch isn't cold, we skip to the next one.
488 BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
489 if (SuccProb > MinBranchProbability)
490 continue;
491
492 LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
493 << SI->getName()
494 << "\nBranch Probability = " << SuccProb << "\n";);
495
496 SmallVector<BasicBlock *, 8> DominateVector;
497 DT.getDescendants(*SI, DominateVector);
498 assert(!DominateVector.empty() &&
499 "SI should be reachable and have at least itself as descendant");
500
501 // We can only outline single entry regions (for now).
502 if (!DominateVector.front()->hasNPredecessors(1)) {
503 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
504 << " doesn't have a single predecessor in the "
505 "dominator tree\n";);
506 continue;
507 }
508
509 BasicBlock *ExitBlock = nullptr;
510 // We can only outline single exit regions (for now).
511 if (!(ExitBlock = IsSingleExit(DominateVector))) {
512 LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
513 << " doesn't have a unique successor\n";);
514 continue;
515 }
516
517 InstructionCost OutlineRegionCost = 0;
518 for (auto *BB : DominateVector)
519 OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
520
521 LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
522 << "\n";);
523
524 if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
525 ORE.emit([&]() {
526 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
527 &SI->front())
528 << ore::NV("Callee", &F)
529 << " inline cost-savings smaller than "
530 << ore::NV("Cost", MinOutlineRegionCost);
531 });
532
533 LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
534 << MinOutlineRegionCost << "\n";);
535 continue;
536 }
537
538 // For now, ignore blocks that belong to a SISE region that is a
539 // candidate for outlining. In the future, we may want to look
540 // at inner regions because the outer region may have live-exit
541 // variables.
542 for (auto *BB : DominateVector)
543 VisitedMap[BB] = true;
544
545 // ReturnBlock here means the block after the outline call
546 BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
547 FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
548 DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
549 OutliningInfo->ORI.push_back(RegInfo);
550 LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
551 << DominateVector.front()->getName() << "\n";);
552 ColdCandidateFound = true;
553 NumColdRegionsFound++;
554 }
555 }
556
557 if (ColdCandidateFound)
558 return OutliningInfo;
559
560 return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
561 }
562
563 std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function & F) const564 PartialInlinerImpl::computeOutliningInfo(Function &F) const {
565 BasicBlock *EntryBlock = &F.front();
566 BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
567 if (!BR || BR->isUnconditional())
568 return std::unique_ptr<FunctionOutliningInfo>();
569
570 // Returns true if Succ is BB's successor
571 auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
572 return is_contained(successors(BB), Succ);
573 };
574
575 auto IsReturnBlock = [](BasicBlock *BB) {
576 Instruction *TI = BB->getTerminator();
577 return isa<ReturnInst>(TI);
578 };
579
580 auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
581 if (IsReturnBlock(Succ1))
582 return std::make_tuple(Succ1, Succ2);
583 if (IsReturnBlock(Succ2))
584 return std::make_tuple(Succ2, Succ1);
585
586 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
587 };
588
589 // Detect a triangular shape:
590 auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
591 if (IsSuccessor(Succ1, Succ2))
592 return std::make_tuple(Succ1, Succ2);
593 if (IsSuccessor(Succ2, Succ1))
594 return std::make_tuple(Succ2, Succ1);
595
596 return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
597 };
598
599 std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
600 std::make_unique<FunctionOutliningInfo>();
601
602 BasicBlock *CurrEntry = EntryBlock;
603 bool CandidateFound = false;
604 do {
605 // The number of blocks to be inlined has already reached
606 // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
607 // disables partial inlining for the function.
608 if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
609 break;
610
611 if (succ_size(CurrEntry) != 2)
612 break;
613
614 BasicBlock *Succ1 = *succ_begin(CurrEntry);
615 BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
616
617 BasicBlock *ReturnBlock, *NonReturnBlock;
618 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
619
620 if (ReturnBlock) {
621 OutliningInfo->Entries.push_back(CurrEntry);
622 OutliningInfo->ReturnBlock = ReturnBlock;
623 OutliningInfo->NonReturnBlock = NonReturnBlock;
624 CandidateFound = true;
625 break;
626 }
627
628 BasicBlock *CommSucc, *OtherSucc;
629 std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
630
631 if (!CommSucc)
632 break;
633
634 OutliningInfo->Entries.push_back(CurrEntry);
635 CurrEntry = OtherSucc;
636 } while (true);
637
638 if (!CandidateFound)
639 return std::unique_ptr<FunctionOutliningInfo>();
640
641 // There should not be any successors (not in the entry set) other than
642 // {ReturnBlock, NonReturnBlock}
643 assert(OutliningInfo->Entries[0] == &F.front() &&
644 "Function Entry must be the first in Entries vector");
645 DenseSet<BasicBlock *> Entries;
646 for (BasicBlock *E : OutliningInfo->Entries)
647 Entries.insert(E);
648
649 // Returns true of BB has Predecessor which is not
650 // in Entries set.
651 auto HasNonEntryPred = [Entries](BasicBlock *BB) {
652 for (auto *Pred : predecessors(BB)) {
653 if (!Entries.count(Pred))
654 return true;
655 }
656 return false;
657 };
658 auto CheckAndNormalizeCandidate =
659 [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
660 for (BasicBlock *E : OutliningInfo->Entries) {
661 for (auto *Succ : successors(E)) {
662 if (Entries.count(Succ))
663 continue;
664 if (Succ == OutliningInfo->ReturnBlock)
665 OutliningInfo->ReturnBlockPreds.push_back(E);
666 else if (Succ != OutliningInfo->NonReturnBlock)
667 return false;
668 }
669 // There should not be any outside incoming edges either:
670 if (HasNonEntryPred(E))
671 return false;
672 }
673 return true;
674 };
675
676 if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
677 return std::unique_ptr<FunctionOutliningInfo>();
678
679 // Now further growing the candidate's inlining region by
680 // peeling off dominating blocks from the outlining region:
681 while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
682 BasicBlock *Cand = OutliningInfo->NonReturnBlock;
683 if (succ_size(Cand) != 2)
684 break;
685
686 if (HasNonEntryPred(Cand))
687 break;
688
689 BasicBlock *Succ1 = *succ_begin(Cand);
690 BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
691
692 BasicBlock *ReturnBlock, *NonReturnBlock;
693 std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
694 if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
695 break;
696
697 if (NonReturnBlock->getSinglePredecessor() != Cand)
698 break;
699
700 // Now grow and update OutlininigInfo:
701 OutliningInfo->Entries.push_back(Cand);
702 OutliningInfo->NonReturnBlock = NonReturnBlock;
703 OutliningInfo->ReturnBlockPreds.push_back(Cand);
704 Entries.insert(Cand);
705 }
706
707 return OutliningInfo;
708 }
709
710 // Check if there is PGO data or user annotated branch data:
hasProfileData(const Function & F,const FunctionOutliningInfo & OI)711 static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
712 if (F.hasProfileData())
713 return true;
714 // Now check if any of the entry block has MD_prof data:
715 for (auto *E : OI.Entries) {
716 BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
717 if (!BR || BR->isUnconditional())
718 continue;
719 uint64_t T, F;
720 if (BR->extractProfMetadata(T, F))
721 return true;
722 }
723 return false;
724 }
725
getOutliningCallBBRelativeFreq(FunctionCloner & Cloner) const726 BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
727 FunctionCloner &Cloner) const {
728 BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
729 auto EntryFreq =
730 Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
731 auto OutliningCallFreq =
732 Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
733 // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
734 // we outlined any regions, so we may encounter situations where the
735 // OutliningCallFreq is *slightly* bigger than the EntryFreq.
736 if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
737 OutliningCallFreq = EntryFreq;
738
739 auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
740 OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
741
742 if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI))
743 return OutlineRegionRelFreq;
744
745 // When profile data is not available, we need to be conservative in
746 // estimating the overall savings. Static branch prediction can usually
747 // guess the branch direction right (taken/non-taken), but the guessed
748 // branch probability is usually not biased enough. In case when the
749 // outlined region is predicted to be likely, its probability needs
750 // to be made higher (more biased) to not under-estimate the cost of
751 // function outlining. On the other hand, if the outlined region
752 // is predicted to be less likely, the predicted probablity is usually
753 // higher than the actual. For instance, the actual probability of the
754 // less likely target is only 5%, but the guessed probablity can be
755 // 40%. In the latter case, there is no need for further adjustement.
756 // FIXME: add an option for this.
757 if (OutlineRegionRelFreq < BranchProbability(45, 100))
758 return OutlineRegionRelFreq;
759
760 OutlineRegionRelFreq = std::max(
761 OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
762
763 return OutlineRegionRelFreq;
764 }
765
shouldPartialInline(CallBase & CB,FunctionCloner & Cloner,BlockFrequency WeightedOutliningRcost,OptimizationRemarkEmitter & ORE) const766 bool PartialInlinerImpl::shouldPartialInline(
767 CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
768 OptimizationRemarkEmitter &ORE) const {
769 using namespace ore;
770
771 Function *Callee = CB.getCalledFunction();
772 assert(Callee == Cloner.ClonedFunc);
773
774 if (SkipCostAnalysis)
775 return isInlineViable(*Callee).isSuccess();
776
777 Function *Caller = CB.getCaller();
778 auto &CalleeTTI = GetTTI(*Callee);
779 bool RemarksEnabled =
780 Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
781 DEBUG_TYPE);
782 InlineCost IC =
783 getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
784 GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
785
786 if (IC.isAlways()) {
787 ORE.emit([&]() {
788 return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
789 << NV("Callee", Cloner.OrigFunc)
790 << " should always be fully inlined, not partially";
791 });
792 return false;
793 }
794
795 if (IC.isNever()) {
796 ORE.emit([&]() {
797 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
798 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
799 << NV("Caller", Caller)
800 << " because it should never be inlined (cost=never)";
801 });
802 return false;
803 }
804
805 if (!IC) {
806 ORE.emit([&]() {
807 return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
808 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
809 << NV("Caller", Caller) << " because too costly to inline (cost="
810 << NV("Cost", IC.getCost()) << ", threshold="
811 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
812 });
813 return false;
814 }
815 const DataLayout &DL = Caller->getParent()->getDataLayout();
816
817 // The savings of eliminating the call:
818 int NonWeightedSavings = getCallsiteCost(CB, DL);
819 BlockFrequency NormWeightedSavings(NonWeightedSavings);
820
821 // Weighted saving is smaller than weighted cost, return false
822 if (NormWeightedSavings < WeightedOutliningRcost) {
823 ORE.emit([&]() {
824 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
825 &CB)
826 << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
827 << NV("Caller", Caller) << " runtime overhead (overhead="
828 << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
829 << ", savings="
830 << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
831 << ")"
832 << " of making the outlined call is too high";
833 });
834
835 return false;
836 }
837
838 ORE.emit([&]() {
839 return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
840 << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
841 << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
842 << " (threshold="
843 << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
844 });
845 return true;
846 }
847
848 // TODO: Ideally we should share Inliner's InlineCost Analysis code.
849 // For now use a simplified version. The returned 'InlineCost' will be used
850 // to esimate the size cost as well as runtime cost of the BB.
851 InstructionCost
computeBBInlineCost(BasicBlock * BB,TargetTransformInfo * TTI)852 PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
853 TargetTransformInfo *TTI) {
854 InstructionCost InlineCost = 0;
855 const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
856 for (Instruction &I : BB->instructionsWithoutDebug()) {
857 // Skip free instructions.
858 switch (I.getOpcode()) {
859 case Instruction::BitCast:
860 case Instruction::PtrToInt:
861 case Instruction::IntToPtr:
862 case Instruction::Alloca:
863 case Instruction::PHI:
864 continue;
865 case Instruction::GetElementPtr:
866 if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
867 continue;
868 break;
869 default:
870 break;
871 }
872
873 if (I.isLifetimeStartOrEnd())
874 continue;
875
876 if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
877 Intrinsic::ID IID = II->getIntrinsicID();
878 SmallVector<Type *, 4> Tys;
879 FastMathFlags FMF;
880 for (Value *Val : II->args())
881 Tys.push_back(Val->getType());
882
883 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
884 FMF = FPMO->getFastMathFlags();
885
886 IntrinsicCostAttributes ICA(IID, II->getType(), Tys, FMF);
887 InlineCost += TTI->getIntrinsicInstrCost(ICA, TTI::TCK_SizeAndLatency);
888 continue;
889 }
890
891 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
892 InlineCost += getCallsiteCost(*CI, DL);
893 continue;
894 }
895
896 if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
897 InlineCost += getCallsiteCost(*II, DL);
898 continue;
899 }
900
901 if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
902 InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
903 continue;
904 }
905 InlineCost += InlineConstants::InstrCost;
906 }
907
908 return InlineCost;
909 }
910
911 std::tuple<InstructionCost, InstructionCost>
computeOutliningCosts(FunctionCloner & Cloner) const912 PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
913 InstructionCost OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
914 for (auto FuncBBPair : Cloner.OutlinedFunctions) {
915 Function *OutlinedFunc = FuncBBPair.first;
916 BasicBlock* OutliningCallBB = FuncBBPair.second;
917 // Now compute the cost of the call sequence to the outlined function
918 // 'OutlinedFunction' in BB 'OutliningCallBB':
919 auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
920 OutliningFuncCallCost +=
921 computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
922
923 // Now compute the cost of the extracted/outlined function itself:
924 for (BasicBlock &BB : *OutlinedFunc)
925 OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
926 }
927 assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
928 "Outlined function cost should be no less than the outlined region");
929
930 // The code extractor introduces a new root and exit stub blocks with
931 // additional unconditional branches. Those branches will be eliminated
932 // later with bb layout. The cost should be adjusted accordingly:
933 OutlinedFunctionCost -=
934 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
935
936 InstructionCost OutliningRuntimeOverhead =
937 OutliningFuncCallCost +
938 (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
939 ExtraOutliningPenalty.getValue();
940
941 return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
942 }
943
944 // Create the callsite to profile count map which is
945 // used to update the original function's entry count,
946 // after the function is partially inlined into the callsite.
computeCallsiteToProfCountMap(Function * DuplicateFunction,DenseMap<User *,uint64_t> & CallSiteToProfCountMap) const947 void PartialInlinerImpl::computeCallsiteToProfCountMap(
948 Function *DuplicateFunction,
949 DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
950 std::vector<User *> Users(DuplicateFunction->user_begin(),
951 DuplicateFunction->user_end());
952 Function *CurrentCaller = nullptr;
953 std::unique_ptr<BlockFrequencyInfo> TempBFI;
954 BlockFrequencyInfo *CurrentCallerBFI = nullptr;
955
956 auto ComputeCurrBFI = [&,this](Function *Caller) {
957 // For the old pass manager:
958 if (!GetBFI) {
959 DominatorTree DT(*Caller);
960 LoopInfo LI(DT);
961 BranchProbabilityInfo BPI(*Caller, LI);
962 TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
963 CurrentCallerBFI = TempBFI.get();
964 } else {
965 // New pass manager:
966 CurrentCallerBFI = &(GetBFI(*Caller));
967 }
968 };
969
970 for (User *User : Users) {
971 // Don't bother with BlockAddress used by CallBr for asm goto.
972 if (isa<BlockAddress>(User))
973 continue;
974 CallBase *CB = getSupportedCallBase(User);
975 Function *Caller = CB->getCaller();
976 if (CurrentCaller != Caller) {
977 CurrentCaller = Caller;
978 ComputeCurrBFI(Caller);
979 } else {
980 assert(CurrentCallerBFI && "CallerBFI is not set");
981 }
982 BasicBlock *CallBB = CB->getParent();
983 auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
984 if (Count)
985 CallSiteToProfCountMap[User] = *Count;
986 else
987 CallSiteToProfCountMap[User] = 0;
988 }
989 }
990
FunctionCloner(Function * F,FunctionOutliningInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)991 PartialInlinerImpl::FunctionCloner::FunctionCloner(
992 Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
993 function_ref<AssumptionCache *(Function &)> LookupAC,
994 function_ref<TargetTransformInfo &(Function &)> GetTTI)
995 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
996 ClonedOI = std::make_unique<FunctionOutliningInfo>();
997
998 // Clone the function, so that we can hack away on it.
999 ValueToValueMapTy VMap;
1000 ClonedFunc = CloneFunction(F, VMap);
1001
1002 ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
1003 ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
1004 for (BasicBlock *BB : OI->Entries)
1005 ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
1006
1007 for (BasicBlock *E : OI->ReturnBlockPreds) {
1008 BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
1009 ClonedOI->ReturnBlockPreds.push_back(NewE);
1010 }
1011 // Go ahead and update all uses to the duplicate, so that we can just
1012 // use the inliner functionality when we're done hacking.
1013 F->replaceAllUsesWith(ClonedFunc);
1014 }
1015
FunctionCloner(Function * F,FunctionOutliningMultiRegionInfo * OI,OptimizationRemarkEmitter & ORE,function_ref<AssumptionCache * (Function &)> LookupAC,function_ref<TargetTransformInfo & (Function &)> GetTTI)1016 PartialInlinerImpl::FunctionCloner::FunctionCloner(
1017 Function *F, FunctionOutliningMultiRegionInfo *OI,
1018 OptimizationRemarkEmitter &ORE,
1019 function_ref<AssumptionCache *(Function &)> LookupAC,
1020 function_ref<TargetTransformInfo &(Function &)> GetTTI)
1021 : OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
1022 ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
1023
1024 // Clone the function, so that we can hack away on it.
1025 ValueToValueMapTy VMap;
1026 ClonedFunc = CloneFunction(F, VMap);
1027
1028 // Go through all Outline Candidate Regions and update all BasicBlock
1029 // information.
1030 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1031 OI->ORI) {
1032 SmallVector<BasicBlock *, 8> Region;
1033 for (BasicBlock *BB : RegionInfo.Region)
1034 Region.push_back(cast<BasicBlock>(VMap[BB]));
1035
1036 BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
1037 BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
1038 BasicBlock *NewReturnBlock = nullptr;
1039 if (RegionInfo.ReturnBlock)
1040 NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
1041 FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
1042 Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
1043 ClonedOMRI->ORI.push_back(MappedRegionInfo);
1044 }
1045 // Go ahead and update all uses to the duplicate, so that we can just
1046 // use the inliner functionality when we're done hacking.
1047 F->replaceAllUsesWith(ClonedFunc);
1048 }
1049
normalizeReturnBlock() const1050 void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
1051 auto GetFirstPHI = [](BasicBlock *BB) {
1052 BasicBlock::iterator I = BB->begin();
1053 PHINode *FirstPhi = nullptr;
1054 while (I != BB->end()) {
1055 PHINode *Phi = dyn_cast<PHINode>(I);
1056 if (!Phi)
1057 break;
1058 if (!FirstPhi) {
1059 FirstPhi = Phi;
1060 break;
1061 }
1062 }
1063 return FirstPhi;
1064 };
1065
1066 // Shouldn't need to normalize PHIs if we're not outlining non-early return
1067 // blocks.
1068 if (!ClonedOI)
1069 return;
1070
1071 // Special hackery is needed with PHI nodes that have inputs from more than
1072 // one extracted block. For simplicity, just split the PHIs into a two-level
1073 // sequence of PHIs, some of which will go in the extracted region, and some
1074 // of which will go outside.
1075 BasicBlock *PreReturn = ClonedOI->ReturnBlock;
1076 // only split block when necessary:
1077 PHINode *FirstPhi = GetFirstPHI(PreReturn);
1078 unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
1079
1080 if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
1081 return;
1082
1083 auto IsTrivialPhi = [](PHINode *PN) -> Value * {
1084 Value *CommonValue = PN->getIncomingValue(0);
1085 if (all_of(PN->incoming_values(),
1086 [&](Value *V) { return V == CommonValue; }))
1087 return CommonValue;
1088 return nullptr;
1089 };
1090
1091 ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
1092 ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
1093 BasicBlock::iterator I = PreReturn->begin();
1094 Instruction *Ins = &ClonedOI->ReturnBlock->front();
1095 SmallVector<Instruction *, 4> DeadPhis;
1096 while (I != PreReturn->end()) {
1097 PHINode *OldPhi = dyn_cast<PHINode>(I);
1098 if (!OldPhi)
1099 break;
1100
1101 PHINode *RetPhi =
1102 PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
1103 OldPhi->replaceAllUsesWith(RetPhi);
1104 Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
1105
1106 RetPhi->addIncoming(&*I, PreReturn);
1107 for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
1108 RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
1109 OldPhi->removeIncomingValue(E);
1110 }
1111
1112 // After incoming values splitting, the old phi may become trivial.
1113 // Keeping the trivial phi can introduce definition inside the outline
1114 // region which is live-out, causing necessary overhead (load, store
1115 // arg passing etc).
1116 if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
1117 OldPhi->replaceAllUsesWith(OldPhiVal);
1118 DeadPhis.push_back(OldPhi);
1119 }
1120 ++I;
1121 }
1122 for (auto *DP : DeadPhis)
1123 DP->eraseFromParent();
1124
1125 for (auto *E : ClonedOI->ReturnBlockPreds)
1126 E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
1127 }
1128
doMultiRegionFunctionOutlining()1129 bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
1130
1131 auto ComputeRegionCost =
1132 [&](SmallVectorImpl<BasicBlock *> &Region) -> InstructionCost {
1133 InstructionCost Cost = 0;
1134 for (BasicBlock* BB : Region)
1135 Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
1136 return Cost;
1137 };
1138
1139 assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
1140
1141 if (ClonedOMRI->ORI.empty())
1142 return false;
1143
1144 // The CodeExtractor needs a dominator tree.
1145 DominatorTree DT;
1146 DT.recalculate(*ClonedFunc);
1147
1148 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1149 LoopInfo LI(DT);
1150 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1151 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1152
1153 // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
1154 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1155
1156 SetVector<Value *> Inputs, Outputs, Sinks;
1157 for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
1158 ClonedOMRI->ORI) {
1159 InstructionCost CurrentOutlinedRegionCost =
1160 ComputeRegionCost(RegionInfo.Region);
1161
1162 CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
1163 ClonedFuncBFI.get(), &BPI,
1164 LookupAC(*RegionInfo.EntryBlock->getParent()),
1165 /* AllowVarargs */ false);
1166
1167 CE.findInputsOutputs(Inputs, Outputs, Sinks);
1168
1169 LLVM_DEBUG({
1170 dbgs() << "inputs: " << Inputs.size() << "\n";
1171 dbgs() << "outputs: " << Outputs.size() << "\n";
1172 for (Value *value : Inputs)
1173 dbgs() << "value used in func: " << *value << "\n";
1174 for (Value *output : Outputs)
1175 dbgs() << "instr used in func: " << *output << "\n";
1176 });
1177
1178 // Do not extract regions that have live exit variables.
1179 if (Outputs.size() > 0 && !ForceLiveExit)
1180 continue;
1181
1182 if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
1183 CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
1184 BasicBlock *OutliningCallBB = OCS->getParent();
1185 assert(OutliningCallBB->getParent() == ClonedFunc);
1186 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
1187 NumColdRegionsOutlined++;
1188 OutlinedRegionCost += CurrentOutlinedRegionCost;
1189
1190 if (MarkOutlinedColdCC) {
1191 OutlinedFunc->setCallingConv(CallingConv::Cold);
1192 OCS->setCallingConv(CallingConv::Cold);
1193 }
1194 } else
1195 ORE.emit([&]() {
1196 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1197 &RegionInfo.Region.front()->front())
1198 << "Failed to extract region at block "
1199 << ore::NV("Block", RegionInfo.Region.front());
1200 });
1201 }
1202
1203 return !OutlinedFunctions.empty();
1204 }
1205
1206 Function *
doSingleRegionFunctionOutlining()1207 PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
1208 // Returns true if the block is to be partial inlined into the caller
1209 // (i.e. not to be extracted to the out of line function)
1210 auto ToBeInlined = [&, this](BasicBlock *BB) {
1211 return BB == ClonedOI->ReturnBlock ||
1212 llvm::is_contained(ClonedOI->Entries, BB);
1213 };
1214
1215 assert(ClonedOI && "Expecting OutlineInfo for single region outline");
1216 // The CodeExtractor needs a dominator tree.
1217 DominatorTree DT;
1218 DT.recalculate(*ClonedFunc);
1219
1220 // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
1221 LoopInfo LI(DT);
1222 BranchProbabilityInfo BPI(*ClonedFunc, LI);
1223 ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
1224
1225 // Gather up the blocks that we're going to extract.
1226 std::vector<BasicBlock *> ToExtract;
1227 auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
1228 ToExtract.push_back(ClonedOI->NonReturnBlock);
1229 OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
1230 ClonedOI->NonReturnBlock, ClonedFuncTTI);
1231 for (BasicBlock &BB : *ClonedFunc)
1232 if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
1233 ToExtract.push_back(&BB);
1234 // FIXME: the code extractor may hoist/sink more code
1235 // into the outlined function which may make the outlining
1236 // overhead (the difference of the outlined function cost
1237 // and OutliningRegionCost) look larger.
1238 OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
1239 }
1240
1241 // Extract the body of the if.
1242 CodeExtractorAnalysisCache CEAC(*ClonedFunc);
1243 Function *OutlinedFunc =
1244 CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
1245 ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
1246 /* AllowVarargs */ true)
1247 .extractCodeRegion(CEAC);
1248
1249 if (OutlinedFunc) {
1250 BasicBlock *OutliningCallBB =
1251 PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
1252 assert(OutliningCallBB->getParent() == ClonedFunc);
1253 OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
1254 } else
1255 ORE.emit([&]() {
1256 return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
1257 &ToExtract.front()->front())
1258 << "Failed to extract region at block "
1259 << ore::NV("Block", ToExtract.front());
1260 });
1261
1262 return OutlinedFunc;
1263 }
1264
~FunctionCloner()1265 PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
1266 // Ditch the duplicate, since we're done with it, and rewrite all remaining
1267 // users (function pointers, etc.) back to the original function.
1268 ClonedFunc->replaceAllUsesWith(OrigFunc);
1269 ClonedFunc->eraseFromParent();
1270 if (!IsFunctionInlined) {
1271 // Remove each function that was speculatively created if there is no
1272 // reference.
1273 for (auto FuncBBPair : OutlinedFunctions) {
1274 Function *Func = FuncBBPair.first;
1275 Func->eraseFromParent();
1276 }
1277 }
1278 }
1279
unswitchFunction(Function & F)1280 std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
1281 if (F.hasAddressTaken())
1282 return {false, nullptr};
1283
1284 // Let inliner handle it
1285 if (F.hasFnAttribute(Attribute::AlwaysInline))
1286 return {false, nullptr};
1287
1288 if (F.hasFnAttribute(Attribute::NoInline))
1289 return {false, nullptr};
1290
1291 if (PSI.isFunctionEntryCold(&F))
1292 return {false, nullptr};
1293
1294 if (F.users().empty())
1295 return {false, nullptr};
1296
1297 OptimizationRemarkEmitter ORE(&F);
1298
1299 // Only try to outline cold regions if we have a profile summary, which
1300 // implies we have profiling information.
1301 if (PSI.hasProfileSummary() && F.hasProfileData() &&
1302 !DisableMultiRegionPartialInline) {
1303 std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
1304 computeOutliningColdRegionsInfo(F, ORE);
1305 if (OMRI) {
1306 FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
1307
1308 LLVM_DEBUG({
1309 dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
1310 dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
1311 << "\n";
1312 });
1313
1314 bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
1315
1316 if (DidOutline) {
1317 LLVM_DEBUG({
1318 dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
1319 Cloner.ClonedFunc->print(dbgs());
1320 dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
1321 });
1322
1323 if (tryPartialInline(Cloner))
1324 return {true, nullptr};
1325 }
1326 }
1327 }
1328
1329 // Fall-thru to regular partial inlining if we:
1330 // i) can't find any cold regions to outline, or
1331 // ii) can't inline the outlined function anywhere.
1332 std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
1333 if (!OI)
1334 return {false, nullptr};
1335
1336 FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
1337 Cloner.normalizeReturnBlock();
1338
1339 Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
1340
1341 if (!OutlinedFunction)
1342 return {false, nullptr};
1343
1344 if (tryPartialInline(Cloner))
1345 return {true, OutlinedFunction};
1346
1347 return {false, nullptr};
1348 }
1349
tryPartialInline(FunctionCloner & Cloner)1350 bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
1351 if (Cloner.OutlinedFunctions.empty())
1352 return false;
1353
1354 int SizeCost = 0;
1355 BlockFrequency WeightedRcost;
1356 int NonWeightedRcost;
1357
1358 auto OutliningCosts = computeOutliningCosts(Cloner);
1359 assert(std::get<0>(OutliningCosts).isValid() &&
1360 std::get<1>(OutliningCosts).isValid() && "Expected valid costs");
1361
1362 SizeCost = *std::get<0>(OutliningCosts).getValue();
1363 NonWeightedRcost = *std::get<1>(OutliningCosts).getValue();
1364
1365 // Only calculate RelativeToEntryFreq when we are doing single region
1366 // outlining.
1367 BranchProbability RelativeToEntryFreq;
1368 if (Cloner.ClonedOI)
1369 RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
1370 else
1371 // RelativeToEntryFreq doesn't make sense when we have more than one
1372 // outlined call because each call will have a different relative frequency
1373 // to the entry block. We can consider using the average, but the
1374 // usefulness of that information is questionable. For now, assume we never
1375 // execute the calls to outlined functions.
1376 RelativeToEntryFreq = BranchProbability(0, 1);
1377
1378 WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
1379
1380 // The call sequence(s) to the outlined function(s) are larger than the sum of
1381 // the original outlined region size(s), it does not increase the chances of
1382 // inlining the function with outlining (The inliner uses the size increase to
1383 // model the cost of inlining a callee).
1384 if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
1385 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1386 DebugLoc DLoc;
1387 BasicBlock *Block;
1388 std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
1389 OrigFuncORE.emit([&]() {
1390 return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
1391 DLoc, Block)
1392 << ore::NV("Function", Cloner.OrigFunc)
1393 << " not partially inlined into callers (Original Size = "
1394 << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
1395 << ", Size of call sequence to outlined function = "
1396 << ore::NV("NewSize", SizeCost) << ")";
1397 });
1398 return false;
1399 }
1400
1401 assert(Cloner.OrigFunc->users().empty() &&
1402 "F's users should all be replaced!");
1403
1404 std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
1405 Cloner.ClonedFunc->user_end());
1406
1407 DenseMap<User *, uint64_t> CallSiteToProfCountMap;
1408 auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
1409 if (CalleeEntryCount)
1410 computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
1411
1412 uint64_t CalleeEntryCountV =
1413 (CalleeEntryCount ? CalleeEntryCount->getCount() : 0);
1414
1415 bool AnyInline = false;
1416 for (User *User : Users) {
1417 // Don't bother with BlockAddress used by CallBr for asm goto.
1418 if (isa<BlockAddress>(User))
1419 continue;
1420
1421 CallBase *CB = getSupportedCallBase(User);
1422
1423 if (isLimitReached())
1424 continue;
1425
1426 OptimizationRemarkEmitter CallerORE(CB->getCaller());
1427 if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
1428 continue;
1429
1430 // Construct remark before doing the inlining, as after successful inlining
1431 // the callsite is removed.
1432 OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
1433 OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
1434 << ore::NV("Caller", CB->getCaller());
1435
1436 InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
1437 // We can only forward varargs when we outlined a single region, else we
1438 // bail on vararg functions.
1439 if (!InlineFunction(*CB, IFI, nullptr, true,
1440 (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
1441 : nullptr))
1442 .isSuccess())
1443 continue;
1444
1445 CallerORE.emit(OR);
1446
1447 // Now update the entry count:
1448 if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
1449 uint64_t CallSiteCount = CallSiteToProfCountMap[User];
1450 CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
1451 }
1452
1453 AnyInline = true;
1454 NumPartialInlining++;
1455 // Update the stats
1456 if (Cloner.ClonedOI)
1457 NumPartialInlined++;
1458 else
1459 NumColdOutlinePartialInlined++;
1460 }
1461
1462 if (AnyInline) {
1463 Cloner.IsFunctionInlined = true;
1464 if (CalleeEntryCount)
1465 Cloner.OrigFunc->setEntryCount(Function::ProfileCount(
1466 CalleeEntryCountV, CalleeEntryCount->getType()));
1467 OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
1468 OrigFuncORE.emit([&]() {
1469 return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
1470 << "Partially inlined into at least one caller";
1471 });
1472 }
1473
1474 return AnyInline;
1475 }
1476
run(Module & M)1477 bool PartialInlinerImpl::run(Module &M) {
1478 if (DisablePartialInlining)
1479 return false;
1480
1481 std::vector<Function *> Worklist;
1482 Worklist.reserve(M.size());
1483 for (Function &F : M)
1484 if (!F.use_empty() && !F.isDeclaration())
1485 Worklist.push_back(&F);
1486
1487 bool Changed = false;
1488 while (!Worklist.empty()) {
1489 Function *CurrFunc = Worklist.back();
1490 Worklist.pop_back();
1491
1492 if (CurrFunc->use_empty())
1493 continue;
1494
1495 bool Recursive = false;
1496 for (User *U : CurrFunc->users())
1497 if (Instruction *I = dyn_cast<Instruction>(U))
1498 if (I->getParent()->getParent() == CurrFunc) {
1499 Recursive = true;
1500 break;
1501 }
1502 if (Recursive)
1503 continue;
1504
1505 std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
1506 if (Result.second)
1507 Worklist.push_back(Result.second);
1508 Changed |= Result.first;
1509 }
1510
1511 return Changed;
1512 }
1513
1514 char PartialInlinerLegacyPass::ID = 0;
1515
1516 INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
1517 "Partial Inliner", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)1518 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1519 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
1520 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1521 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
1522 INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
1523 "Partial Inliner", false, false)
1524
1525 ModulePass *llvm::createPartialInliningPass() {
1526 return new PartialInlinerLegacyPass();
1527 }
1528
run(Module & M,ModuleAnalysisManager & AM)1529 PreservedAnalyses PartialInlinerPass::run(Module &M,
1530 ModuleAnalysisManager &AM) {
1531 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1532
1533 auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
1534 return FAM.getResult<AssumptionAnalysis>(F);
1535 };
1536
1537 auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
1538 return FAM.getCachedResult<AssumptionAnalysis>(F);
1539 };
1540
1541 auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
1542 return FAM.getResult<BlockFrequencyAnalysis>(F);
1543 };
1544
1545 auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
1546 return FAM.getResult<TargetIRAnalysis>(F);
1547 };
1548
1549 auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
1550 return FAM.getResult<TargetLibraryAnalysis>(F);
1551 };
1552
1553 ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
1554
1555 if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
1556 GetTLI, PSI, GetBFI)
1557 .run(M))
1558 return PreservedAnalyses::none();
1559 return PreservedAnalyses::all();
1560 }
1561