1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Outline cold regions to a separate function.
11 // TODO: Update BFI and BPI
12 // TODO: Add all the outlined functions to a separate section.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/Analysis/AliasAnalysis.h"
19 #include "llvm/Analysis/BlockFrequencyInfo.h"
20 #include "llvm/Analysis/BranchProbabilityInfo.h"
21 #include "llvm/Analysis/CFG.h"
22 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
23 #include "llvm/Analysis/PostDominators.h"
24 #include "llvm/Analysis/ProfileSummaryInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/CFG.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/DiagnosticInfo.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/Function.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/IntrinsicInst.h"
35 #include "llvm/IR/Metadata.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/IR/PassManager.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/IR/Use.h"
40 #include "llvm/IR/User.h"
41 #include "llvm/IR/Value.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/BlockFrequency.h"
44 #include "llvm/Support/BranchProbability.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include "llvm/Transforms/IPO.h"
48 #include "llvm/Transforms/IPO/HotColdSplitting.h"
49 #include "llvm/Transforms/Scalar.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
51 #include "llvm/Transforms/Utils/Cloning.h"
52 #include "llvm/Transforms/Utils/CodeExtractor.h"
53 #include "llvm/Transforms/Utils/Local.h"
54 #include "llvm/Transforms/Utils/SSAUpdater.h"
55 #include "llvm/Transforms/Utils/ValueMapper.h"
56 #include <algorithm>
57 #include <cassert>
58 
59 #define DEBUG_TYPE "hotcoldsplit"
60 
61 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
62 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
63 
64 using namespace llvm;
65 
66 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
67                               cl::init(true), cl::Hidden);
68 
69 static cl::opt<int>
70     MinOutliningThreshold("min-outlining-thresh", cl::init(3), cl::Hidden,
71                           cl::desc("Code size threshold for outlining within a "
72                                    "single BB (as a multiple of TCC_Basic)"));
73 
74 namespace {
75 
76 struct PostDomTree : PostDomTreeBase<BasicBlock> {
77   PostDomTree(Function &F) { recalculate(F); }
78 };
79 
80 /// A sequence of basic blocks.
81 ///
82 /// A 0-sized SmallVector is slightly cheaper to move than a std::vector.
83 using BlockSequence = SmallVector<BasicBlock *, 0>;
84 
85 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
86 // this function unless you modify the MBB version as well.
87 //
88 /// A no successor, non-return block probably ends in unreachable and is cold.
89 /// Also consider a block that ends in an indirect branch to be a return block,
90 /// since many targets use plain indirect branches to return.
91 bool blockEndsInUnreachable(const BasicBlock &BB) {
92   if (!succ_empty(&BB))
93     return false;
94   if (BB.empty())
95     return true;
96   const Instruction *I = BB.getTerminator();
97   return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
98 }
99 
100 static bool exceptionHandlingFunctions(const CallInst *CI) {
101   auto F = CI->getCalledFunction();
102   if (!F)
103     return false;
104   auto FName = F->getName();
105   return FName == "__cxa_begin_catch" ||
106          FName == "__cxa_free_exception" ||
107          FName == "__cxa_allocate_exception" ||
108          FName == "__cxa_begin_catch" ||
109          FName == "__cxa_end_catch";
110 }
111 
112 static bool unlikelyExecuted(const BasicBlock &BB) {
113   if (blockEndsInUnreachable(BB))
114     return true;
115   // Exception handling blocks are unlikely executed.
116   if (BB.isEHPad())
117     return true;
118   for (const Instruction &I : BB)
119     if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
120       // The block is cold if it calls functions tagged as cold or noreturn.
121       if (CI->hasFnAttr(Attribute::Cold) ||
122           CI->hasFnAttr(Attribute::NoReturn) ||
123           exceptionHandlingFunctions(CI))
124         return true;
125 
126       // Assume that inline assembly is hot code.
127       if (isa<InlineAsm>(CI->getCalledValue()))
128         return false;
129     }
130   return false;
131 }
132 
133 /// Check whether it's safe to outline \p BB.
134 static bool mayExtractBlock(const BasicBlock &BB) {
135   return !BB.hasAddressTaken();
136 }
137 
138 /// Check whether \p BB is profitable to outline (i.e. its code size cost meets
139 /// the threshold set in \p MinOutliningThreshold).
140 static bool isProfitableToOutline(const BasicBlock &BB,
141                                   TargetTransformInfo &TTI) {
142   int Cost = 0;
143   for (const Instruction &I : BB) {
144     if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
145       continue;
146 
147     Cost += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
148 
149     if (Cost >= (MinOutliningThreshold * TargetTransformInfo::TCC_Basic))
150       return true;
151   }
152   return false;
153 }
154 
155 /// Identify the maximal region of cold blocks which includes \p SinkBB.
156 ///
157 /// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all
158 /// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which
159 /// cannot be outlined.
160 ///
161 /// Return an empty sequence if the cold region is too small to outline, or if
162 /// the cold region has no warm predecessors.
163 static BlockSequence findMaximalColdRegion(BasicBlock &SinkBB,
164                                            TargetTransformInfo &TTI,
165                                            DominatorTree &DT,
166                                            PostDomTree &PDT) {
167   // The maximal cold region.
168   BlockSequence ColdRegion = {};
169 
170   // The ancestor farthest-away from SinkBB, and also post-dominated by it.
171   BasicBlock *MaxAncestor = &SinkBB;
172   unsigned MaxAncestorHeight = 0;
173 
174   // Visit SinkBB's ancestors using inverse DFS.
175   auto PredIt = ++idf_begin(&SinkBB);
176   auto PredEnd = idf_end(&SinkBB);
177   while (PredIt != PredEnd) {
178     BasicBlock &PredBB = **PredIt;
179     bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
180 
181     // If SinkBB does not post-dominate a predecessor, do not mark the
182     // predecessor (or any of its predecessors) cold.
183     if (!SinkPostDom || !mayExtractBlock(PredBB)) {
184       PredIt.skipChildren();
185       continue;
186     }
187 
188     // Keep track of the post-dominated ancestor farthest away from the sink.
189     unsigned AncestorHeight = PredIt.getPathLength();
190     if (AncestorHeight > MaxAncestorHeight) {
191       MaxAncestor = &PredBB;
192       MaxAncestorHeight = AncestorHeight;
193     }
194 
195     ColdRegion.push_back(&PredBB);
196     ++PredIt;
197   }
198 
199   // CodeExtractor requires that all blocks to be extracted must be dominated
200   // by the first block to be extracted.
201   //
202   // To avoid spurious or repeated outlining, require that the max ancestor
203   // has a predecessor. By construction this predecessor is not in the cold
204   // region, i.e. its existence implies we don't outline the whole function.
205   //
206   // TODO: If MaxAncestor has no predecessors, we may be able to outline the
207   // second largest cold region that has a predecessor.
208   if (pred_empty(MaxAncestor) ||
209       MaxAncestor->getSinglePredecessor() == MaxAncestor)
210     return {};
211 
212   // Filter out predecessors not dominated by the max ancestor.
213   //
214   // TODO: Blocks not dominated by the max ancestor could be extracted as
215   // other cold regions. Marking outlined calls as noreturn when appropriate
216   // and outlining more than once per function could achieve most of the win.
217   auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) {
218     return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB);
219   });
220   ColdRegion.erase(EraseIt, ColdRegion.end());
221 
222   // Add SinkBB to the cold region.
223   ColdRegion.push_back(&SinkBB);
224 
225   // Ensure that the first extracted block is the max ancestor.
226   if (ColdRegion[0] != MaxAncestor) {
227     auto AncestorIt = find(ColdRegion, MaxAncestor);
228     *AncestorIt = ColdRegion[0];
229     ColdRegion[0] = MaxAncestor;
230   }
231 
232   // Find all successors of SinkBB dominated by SinkBB using DFS.
233   auto SuccIt = ++df_begin(&SinkBB);
234   auto SuccEnd = df_end(&SinkBB);
235   while (SuccIt != SuccEnd) {
236     BasicBlock &SuccBB = **SuccIt;
237     bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
238 
239     // If SinkBB does not dominate a successor, do not mark the successor (or
240     // any of its successors) cold.
241     if (!SinkDom || !mayExtractBlock(SuccBB)) {
242       SuccIt.skipChildren();
243       continue;
244     }
245 
246     ColdRegion.push_back(&SuccBB);
247     ++SuccIt;
248   }
249 
250   if (ColdRegion.size() == 1 && !isProfitableToOutline(*ColdRegion[0], TTI))
251     return {};
252 
253   return ColdRegion;
254 }
255 
256 /// Get the largest cold region in \p F.
257 static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
258                                           BlockFrequencyInfo *BFI,
259                                           TargetTransformInfo &TTI,
260                                           DominatorTree &DT, PostDomTree &PDT) {
261   // Keep track of the largest cold region.
262   BlockSequence LargestColdRegion = {};
263 
264   for (BasicBlock &BB : F) {
265     // Identify cold blocks.
266     if (!mayExtractBlock(BB))
267       continue;
268     bool Cold =
269         PSI.isColdBlock(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB));
270     if (!Cold)
271       continue;
272 
273     LLVM_DEBUG({
274       dbgs() << "Found cold block:\n";
275       BB.dump();
276     });
277 
278     // Find a maximal cold region we can outline.
279     BlockSequence ColdRegion = findMaximalColdRegion(BB, TTI, DT, PDT);
280     if (ColdRegion.empty()) {
281       LLVM_DEBUG(dbgs() << "  Skipping (block not profitable to extract)\n");
282       continue;
283     }
284 
285     ++NumColdRegionsFound;
286 
287     LLVM_DEBUG({
288       llvm::dbgs() << "Identified cold region with " << ColdRegion.size()
289                    << " blocks:\n";
290       for (BasicBlock *BB : ColdRegion)
291         BB->dump();
292     });
293 
294     // TODO: Outline more than one region.
295     if (ColdRegion.size() > LargestColdRegion.size())
296       LargestColdRegion = std::move(ColdRegion);
297   }
298 
299   return LargestColdRegion;
300 }
301 
302 class HotColdSplitting {
303 public:
304   HotColdSplitting(ProfileSummaryInfo *ProfSI,
305                    function_ref<BlockFrequencyInfo *(Function &)> GBFI,
306                    function_ref<TargetTransformInfo &(Function &)> GTTI,
307                    std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
308       : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE) {}
309   bool run(Module &M);
310 
311 private:
312   bool shouldOutlineFrom(const Function &F) const;
313   Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
314                               BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
315                               OptimizationRemarkEmitter &ORE, unsigned Count);
316   SmallPtrSet<const Function *, 2> OutlinedFunctions;
317   ProfileSummaryInfo *PSI;
318   function_ref<BlockFrequencyInfo *(Function &)> GetBFI;
319   function_ref<TargetTransformInfo &(Function &)> GetTTI;
320   std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
321 };
322 
323 class HotColdSplittingLegacyPass : public ModulePass {
324 public:
325   static char ID;
326   HotColdSplittingLegacyPass() : ModulePass(ID) {
327     initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
328   }
329 
330   void getAnalysisUsage(AnalysisUsage &AU) const override {
331     AU.addRequired<AssumptionCacheTracker>();
332     AU.addRequired<BlockFrequencyInfoWrapperPass>();
333     AU.addRequired<ProfileSummaryInfoWrapperPass>();
334     AU.addRequired<TargetTransformInfoWrapperPass>();
335   }
336 
337   bool runOnModule(Module &M) override;
338 };
339 
340 } // end anonymous namespace
341 
342 // Returns false if the function should not be considered for hot-cold split
343 // optimization.
344 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
345   // Do not try to outline again from an already outlined cold function.
346   if (OutlinedFunctions.count(&F))
347     return false;
348 
349   if (F.size() <= 2)
350     return false;
351 
352   // TODO: Consider only skipping functions marked `optnone` or `cold`.
353 
354   if (F.hasAddressTaken())
355     return false;
356 
357   if (F.hasFnAttribute(Attribute::AlwaysInline))
358     return false;
359 
360   if (F.hasFnAttribute(Attribute::NoInline))
361     return false;
362 
363   if (F.getCallingConv() == CallingConv::Cold)
364     return false;
365 
366   if (PSI->isFunctionEntryCold(&F))
367     return false;
368   return true;
369 }
370 
371 Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
372                                               DominatorTree &DT,
373                                               BlockFrequencyInfo *BFI,
374                                               TargetTransformInfo &TTI,
375                                               OptimizationRemarkEmitter &ORE,
376                                               unsigned Count) {
377   assert(!Region.empty());
378   LLVM_DEBUG(for (auto *BB : Region)
379           llvm::dbgs() << "\nExtracting: " << *BB;);
380 
381   // TODO: Pass BFI and BPI to update profile information.
382   CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
383                    /* BPI */ nullptr, /* AllowVarArgs */ false,
384                    /* AllowAlloca */ false,
385                    /* Suffix */ "cold." + std::to_string(Count));
386 
387   SetVector<Value *> Inputs, Outputs, Sinks;
388   CE.findInputsOutputs(Inputs, Outputs, Sinks);
389 
390   // Do not extract regions that have live exit variables.
391   if (Outputs.size() > 0) {
392     LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n");
393     return nullptr;
394   }
395 
396   // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function.
397   Function *OrigF = Region[0]->getParent();
398   if (Function *OutF = CE.extractCodeRegion()) {
399     User *U = *OutF->user_begin();
400     CallInst *CI = cast<CallInst>(U);
401     CallSite CS(CI);
402     NumColdRegionsOutlined++;
403     if (TTI.useColdCCForColdCall(*OutF)) {
404       OutF->setCallingConv(CallingConv::Cold);
405       CS.setCallingConv(CallingConv::Cold);
406     }
407     CI->setIsNoInline();
408 
409     // Try to make the outlined code as small as possible on the assumption
410     // that it's cold.
411     assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) &&
412            "An outlined function should never be marked optnone");
413     OutF->addFnAttr(Attribute::MinSize);
414 
415     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
416     ORE.emit([&]() {
417       return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
418                                 &*Region[0]->begin())
419              << ore::NV("Original", OrigF) << " split cold code into "
420              << ore::NV("Split", OutF);
421     });
422     return OutF;
423   }
424 
425   ORE.emit([&]() {
426     return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
427                                     &*Region[0]->begin())
428            << "Failed to extract region at block "
429            << ore::NV("Block", Region.front());
430   });
431   return nullptr;
432 }
433 
434 bool HotColdSplitting::run(Module &M) {
435   bool Changed = false;
436   for (auto &F : M) {
437     if (!shouldOutlineFrom(F)) {
438       LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n");
439       continue;
440     }
441 
442     LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
443     DominatorTree DT(F);
444     PostDomTree PDT(F);
445     PDT.recalculate(F);
446     BlockFrequencyInfo *BFI = GetBFI(F);
447     TargetTransformInfo &TTI = GetTTI(F);
448 
449     BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, TTI, DT, PDT);
450     if (ColdRegion.empty())
451       continue;
452 
453     OptimizationRemarkEmitter &ORE = (*GetORE)(F);
454     Function *Outlined =
455         extractColdRegion(ColdRegion, DT, BFI, TTI, ORE, /*Count=*/1);
456     if (Outlined) {
457       OutlinedFunctions.insert(Outlined);
458       Changed = true;
459     }
460   }
461   return Changed;
462 }
463 
464 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
465   if (skipModule(M))
466     return false;
467   ProfileSummaryInfo *PSI =
468       &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
469   auto GTTI = [this](Function &F) -> TargetTransformInfo & {
470     return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
471   };
472   auto GBFI = [this](Function &F) {
473     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
474   };
475   std::unique_ptr<OptimizationRemarkEmitter> ORE;
476   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
477       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
478     ORE.reset(new OptimizationRemarkEmitter(&F));
479     return *ORE.get();
480   };
481 
482   return HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M);
483 }
484 
485 PreservedAnalyses
486 HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
487   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
488 
489   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
490       [&FAM](Function &F) -> AssumptionCache & {
491     return FAM.getResult<AssumptionAnalysis>(F);
492   };
493 
494   auto GBFI = [&FAM](Function &F) {
495     return &FAM.getResult<BlockFrequencyAnalysis>(F);
496   };
497 
498   std::function<TargetTransformInfo &(Function &)> GTTI =
499       [&FAM](Function &F) -> TargetTransformInfo & {
500     return FAM.getResult<TargetIRAnalysis>(F);
501   };
502 
503   std::unique_ptr<OptimizationRemarkEmitter> ORE;
504   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
505       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
506     ORE.reset(new OptimizationRemarkEmitter(&F));
507     return *ORE.get();
508   };
509 
510   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
511 
512   if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M))
513     return PreservedAnalyses::none();
514   return PreservedAnalyses::all();
515 }
516 
517 char HotColdSplittingLegacyPass::ID = 0;
518 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
519                       "Hot Cold Splitting", false, false)
520 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
521 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
522 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
523                     "Hot Cold Splitting", false, false)
524 
525 ModulePass *llvm::createHotColdSplittingPass() {
526   return new HotColdSplittingLegacyPass();
527 }
528