1 //===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Outline cold regions to a separate function.
11 // TODO: Update BFI and BPI
12 // TODO: Add all the outlined functions to a separate section.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Statistic.h"
18 #include "llvm/Analysis/AliasAnalysis.h"
19 #include "llvm/Analysis/BlockFrequencyInfo.h"
20 #include "llvm/Analysis/BranchProbabilityInfo.h"
21 #include "llvm/Analysis/CFG.h"
22 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
23 #include "llvm/Analysis/PostDominators.h"
24 #include "llvm/Analysis/ProfileSummaryInfo.h"
25 #include "llvm/Analysis/TargetTransformInfo.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/CFG.h"
28 #include "llvm/IR/DataLayout.h"
29 #include "llvm/IR/DiagnosticInfo.h"
30 #include "llvm/IR/Dominators.h"
31 #include "llvm/IR/Function.h"
32 #include "llvm/IR/Instruction.h"
33 #include "llvm/IR/Instructions.h"
34 #include "llvm/IR/IntrinsicInst.h"
35 #include "llvm/IR/Metadata.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/IR/PassManager.h"
38 #include "llvm/IR/Type.h"
39 #include "llvm/IR/Use.h"
40 #include "llvm/IR/User.h"
41 #include "llvm/IR/Value.h"
42 #include "llvm/Pass.h"
43 #include "llvm/Support/BlockFrequency.h"
44 #include "llvm/Support/BranchProbability.h"
45 #include "llvm/Support/Debug.h"
46 #include "llvm/Support/raw_ostream.h"
47 #include "llvm/Transforms/IPO.h"
48 #include "llvm/Transforms/IPO/HotColdSplitting.h"
49 #include "llvm/Transforms/Scalar.h"
50 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
51 #include "llvm/Transforms/Utils/Cloning.h"
52 #include "llvm/Transforms/Utils/CodeExtractor.h"
53 #include "llvm/Transforms/Utils/Local.h"
54 #include "llvm/Transforms/Utils/SSAUpdater.h"
55 #include "llvm/Transforms/Utils/ValueMapper.h"
56 #include <algorithm>
57 #include <cassert>
58 
59 #define DEBUG_TYPE "hotcoldsplit"
60 
61 STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
62 STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
63 
64 using namespace llvm;
65 
66 static cl::opt<bool> EnableStaticAnalyis("hot-cold-static-analysis",
67                               cl::init(true), cl::Hidden);
68 
69 static cl::opt<unsigned> MinOutliningInstCount(
70     "min-outlining-inst-count", cl::init(3), cl::Hidden,
71     cl::desc("Minimum number of instructions needed for a single-block region "
72              "to be an outlining candidate"));
73 
74 namespace {
75 
76 struct PostDomTree : PostDomTreeBase<BasicBlock> {
77   PostDomTree(Function &F) { recalculate(F); }
78 };
79 
80 /// A sequence of basic blocks.
81 ///
82 /// A 0-sized SmallVector is slightly cheaper to move than a std::vector.
83 using BlockSequence = SmallVector<BasicBlock *, 0>;
84 
85 // Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
86 // this function unless you modify the MBB version as well.
87 //
88 /// A no successor, non-return block probably ends in unreachable and is cold.
89 /// Also consider a block that ends in an indirect branch to be a return block,
90 /// since many targets use plain indirect branches to return.
91 bool blockEndsInUnreachable(const BasicBlock &BB) {
92   if (!succ_empty(&BB))
93     return false;
94   if (BB.empty())
95     return true;
96   const Instruction *I = BB.getTerminator();
97   return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
98 }
99 
100 static bool exceptionHandlingFunctions(const CallInst *CI) {
101   auto F = CI->getCalledFunction();
102   if (!F)
103     return false;
104   auto FName = F->getName();
105   return FName == "__cxa_begin_catch" ||
106          FName == "__cxa_free_exception" ||
107          FName == "__cxa_allocate_exception" ||
108          FName == "__cxa_begin_catch" ||
109          FName == "__cxa_end_catch";
110 }
111 
112 static bool unlikelyExecuted(const BasicBlock &BB) {
113   if (blockEndsInUnreachable(BB))
114     return true;
115   // Exception handling blocks are unlikely executed.
116   if (BB.isEHPad())
117     return true;
118   for (const Instruction &I : BB)
119     if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
120       // The block is cold if it calls functions tagged as cold or noreturn.
121       if (CI->hasFnAttr(Attribute::Cold) ||
122           CI->hasFnAttr(Attribute::NoReturn) ||
123           exceptionHandlingFunctions(CI))
124         return true;
125 
126       // Assume that inline assembly is hot code.
127       if (isa<InlineAsm>(CI->getCalledValue()))
128         return false;
129     }
130   return false;
131 }
132 
133 /// Check whether it's safe to outline \p BB.
134 static bool mayExtractBlock(const BasicBlock &BB) {
135   return !BB.hasAddressTaken();
136 }
137 
138 /// Check whether \p BB has at least \p Min non-debug, non-terminator
139 /// instructions.
140 static bool hasMinimumInstCount(const BasicBlock &BB, unsigned Min) {
141   unsigned Count = 0;
142   for (const Instruction &I : BB) {
143     if (isa<DbgInfoIntrinsic>(&I) || &I == BB.getTerminator())
144       continue;
145     if (++Count >= Min)
146       return true;
147   }
148   return false;
149 }
150 
151 /// Identify the maximal region of cold blocks which includes \p SinkBB.
152 ///
153 /// Include all blocks post-dominated by \p SinkBB, \p SinkBB itself, and all
154 /// blocks dominated by \p SinkBB. Exclude all other blocks, and blocks which
155 /// cannot be outlined.
156 ///
157 /// Return an empty sequence if the cold region is too small to outline, or if
158 /// the cold region has no warm predecessors.
159 static BlockSequence
160 findMaximalColdRegion(BasicBlock &SinkBB, DominatorTree &DT, PostDomTree &PDT) {
161   // The maximal cold region.
162   BlockSequence ColdRegion = {};
163 
164   // The ancestor farthest-away from SinkBB, and also post-dominated by it.
165   BasicBlock *MaxAncestor = &SinkBB;
166   unsigned MaxAncestorHeight = 0;
167 
168   // Visit SinkBB's ancestors using inverse DFS.
169   auto PredIt = ++idf_begin(&SinkBB);
170   auto PredEnd = idf_end(&SinkBB);
171   while (PredIt != PredEnd) {
172     BasicBlock &PredBB = **PredIt;
173     bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
174 
175     // If SinkBB does not post-dominate a predecessor, do not mark the
176     // predecessor (or any of its predecessors) cold.
177     if (!SinkPostDom || !mayExtractBlock(PredBB)) {
178       PredIt.skipChildren();
179       continue;
180     }
181 
182     // Keep track of the post-dominated ancestor farthest away from the sink.
183     unsigned AncestorHeight = PredIt.getPathLength();
184     if (AncestorHeight > MaxAncestorHeight) {
185       MaxAncestor = &PredBB;
186       MaxAncestorHeight = AncestorHeight;
187     }
188 
189     ColdRegion.push_back(&PredBB);
190     ++PredIt;
191   }
192 
193   // CodeExtractor requires that all blocks to be extracted must be dominated
194   // by the first block to be extracted.
195   //
196   // To avoid spurious or repeated outlining, require that the max ancestor
197   // has a predecessor. By construction this predecessor is not in the cold
198   // region, i.e. its existence implies we don't outline the whole function.
199   //
200   // TODO: If MaxAncestor has no predecessors, we may be able to outline the
201   // second largest cold region that has a predecessor.
202   if (pred_empty(MaxAncestor) ||
203       MaxAncestor->getSinglePredecessor() == MaxAncestor)
204     return {};
205 
206   // Filter out predecessors not dominated by the max ancestor.
207   //
208   // TODO: Blocks not dominated by the max ancestor could be extracted as
209   // other cold regions. Marking outlined calls as noreturn when appropriate
210   // and outlining more than once per function could achieve most of the win.
211   auto EraseIt = remove_if(ColdRegion, [&](BasicBlock *PredBB) {
212     return PredBB != MaxAncestor && !DT.dominates(MaxAncestor, PredBB);
213   });
214   ColdRegion.erase(EraseIt, ColdRegion.end());
215 
216   // Add SinkBB to the cold region.
217   ColdRegion.push_back(&SinkBB);
218 
219   // Ensure that the first extracted block is the max ancestor.
220   if (ColdRegion[0] != MaxAncestor) {
221     auto AncestorIt = find(ColdRegion, MaxAncestor);
222     *AncestorIt = ColdRegion[0];
223     ColdRegion[0] = MaxAncestor;
224   }
225 
226   // Find all successors of SinkBB dominated by SinkBB using DFS.
227   auto SuccIt = ++df_begin(&SinkBB);
228   auto SuccEnd = df_end(&SinkBB);
229   while (SuccIt != SuccEnd) {
230     BasicBlock &SuccBB = **SuccIt;
231     bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
232 
233     // If SinkBB does not dominate a successor, do not mark the successor (or
234     // any of its successors) cold.
235     if (!SinkDom || !mayExtractBlock(SuccBB)) {
236       SuccIt.skipChildren();
237       continue;
238     }
239 
240     ColdRegion.push_back(&SuccBB);
241     ++SuccIt;
242   }
243 
244   if (ColdRegion.size() == 1 &&
245       !hasMinimumInstCount(*ColdRegion[0], MinOutliningInstCount))
246     return {};
247 
248   return ColdRegion;
249 }
250 
251 /// Get the largest cold region in \p F.
252 static BlockSequence getLargestColdRegion(Function &F, ProfileSummaryInfo &PSI,
253                                           BlockFrequencyInfo *BFI,
254                                           DominatorTree &DT, PostDomTree &PDT) {
255   // Keep track of the largest cold region.
256   BlockSequence LargestColdRegion = {};
257 
258   for (BasicBlock &BB : F) {
259     // Identify cold blocks.
260     if (!mayExtractBlock(BB))
261       continue;
262     bool Cold =
263         PSI.isColdBB(&BB, BFI) || (EnableStaticAnalyis && unlikelyExecuted(BB));
264     if (!Cold)
265       continue;
266 
267     LLVM_DEBUG({
268       dbgs() << "Found cold block:\n";
269       BB.dump();
270     });
271 
272     // Find a maximal cold region we can outline.
273     BlockSequence ColdRegion = findMaximalColdRegion(BB, DT, PDT);
274     if (ColdRegion.empty()) {
275       LLVM_DEBUG(dbgs() << "  Skipping (block not profitable to extract)\n");
276       continue;
277     }
278 
279     ++NumColdRegionsFound;
280 
281     LLVM_DEBUG({
282       llvm::dbgs() << "Identified cold region with " << ColdRegion.size()
283                    << " blocks:\n";
284       for (BasicBlock *BB : ColdRegion)
285         BB->dump();
286     });
287 
288     // TODO: Outline more than one region.
289     if (ColdRegion.size() > LargestColdRegion.size())
290       LargestColdRegion = std::move(ColdRegion);
291   }
292 
293   return LargestColdRegion;
294 }
295 
296 class HotColdSplitting {
297 public:
298   HotColdSplitting(ProfileSummaryInfo *ProfSI,
299                    function_ref<BlockFrequencyInfo *(Function &)> GBFI,
300                    function_ref<TargetTransformInfo &(Function &)> GTTI,
301                    std::function<OptimizationRemarkEmitter &(Function &)> *GORE)
302       : PSI(ProfSI), GetBFI(GBFI), GetTTI(GTTI), GetORE(GORE) {}
303   bool run(Module &M);
304 
305 private:
306   bool shouldOutlineFrom(const Function &F) const;
307   Function *extractColdRegion(const BlockSequence &Region, DominatorTree &DT,
308                               BlockFrequencyInfo *BFI,
309                               OptimizationRemarkEmitter &ORE, unsigned Count);
310   SmallPtrSet<const Function *, 2> OutlinedFunctions;
311   ProfileSummaryInfo *PSI;
312   function_ref<BlockFrequencyInfo *(Function &)> GetBFI;
313   function_ref<TargetTransformInfo &(Function &)> GetTTI;
314   std::function<OptimizationRemarkEmitter &(Function &)> *GetORE;
315 };
316 
317 class HotColdSplittingLegacyPass : public ModulePass {
318 public:
319   static char ID;
320   HotColdSplittingLegacyPass() : ModulePass(ID) {
321     initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
322   }
323 
324   void getAnalysisUsage(AnalysisUsage &AU) const override {
325     AU.addRequired<AssumptionCacheTracker>();
326     AU.addRequired<BlockFrequencyInfoWrapperPass>();
327     AU.addRequired<ProfileSummaryInfoWrapperPass>();
328     AU.addRequired<TargetTransformInfoWrapperPass>();
329   }
330 
331   bool runOnModule(Module &M) override;
332 };
333 
334 } // end anonymous namespace
335 
336 // Returns false if the function should not be considered for hot-cold split
337 // optimization.
338 bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
339   // Do not try to outline again from an already outlined cold function.
340   if (OutlinedFunctions.count(&F))
341     return false;
342 
343   if (F.size() <= 2)
344     return false;
345 
346   // TODO: Consider only skipping functions marked `optnone` or `cold`.
347 
348   if (F.hasAddressTaken())
349     return false;
350 
351   if (F.hasFnAttribute(Attribute::AlwaysInline))
352     return false;
353 
354   if (F.hasFnAttribute(Attribute::NoInline))
355     return false;
356 
357   if (F.getCallingConv() == CallingConv::Cold)
358     return false;
359 
360   if (PSI->isFunctionEntryCold(&F))
361     return false;
362   return true;
363 }
364 
365 Function *HotColdSplitting::extractColdRegion(const BlockSequence &Region,
366                                               DominatorTree &DT,
367                                               BlockFrequencyInfo *BFI,
368                                               OptimizationRemarkEmitter &ORE,
369                                               unsigned Count) {
370   assert(!Region.empty());
371   LLVM_DEBUG(for (auto *BB : Region)
372           llvm::dbgs() << "\nExtracting: " << *BB;);
373 
374   // TODO: Pass BFI and BPI to update profile information.
375   CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
376                    /* BPI */ nullptr, /* AllowVarArgs */ false,
377                    /* AllowAlloca */ false,
378                    /* Suffix */ "cold." + std::to_string(Count));
379 
380   SetVector<Value *> Inputs, Outputs, Sinks;
381   CE.findInputsOutputs(Inputs, Outputs, Sinks);
382 
383   // Do not extract regions that have live exit variables.
384   if (Outputs.size() > 0) {
385     LLVM_DEBUG(llvm::dbgs() << "Not outlining; live outputs\n");
386     return nullptr;
387   }
388 
389   // TODO: Run MergeBasicBlockIntoOnlyPred on the outlined function.
390   Function *OrigF = Region[0]->getParent();
391   if (Function *OutF = CE.extractCodeRegion()) {
392     User *U = *OutF->user_begin();
393     CallInst *CI = cast<CallInst>(U);
394     CallSite CS(CI);
395     NumColdRegionsOutlined++;
396     if (GetTTI(*OutF).useColdCCForColdCall(*OutF)) {
397       OutF->setCallingConv(CallingConv::Cold);
398       CS.setCallingConv(CallingConv::Cold);
399     }
400     CI->setIsNoInline();
401 
402     // Try to make the outlined code as small as possible on the assumption
403     // that it's cold.
404     assert(!OutF->hasFnAttribute(Attribute::OptimizeNone) &&
405            "An outlined function should never be marked optnone");
406     OutF->addFnAttr(Attribute::MinSize);
407 
408     LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
409     ORE.emit([&]() {
410       return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
411                                 &*Region[0]->begin())
412              << ore::NV("Original", OrigF) << " split cold code into "
413              << ore::NV("Split", OutF);
414     });
415     return OutF;
416   }
417 
418   ORE.emit([&]() {
419     return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
420                                     &*Region[0]->begin())
421            << "Failed to extract region at block "
422            << ore::NV("Block", Region.front());
423   });
424   return nullptr;
425 }
426 
427 bool HotColdSplitting::run(Module &M) {
428   bool Changed = false;
429   for (auto &F : M) {
430     if (!shouldOutlineFrom(F)) {
431       LLVM_DEBUG(llvm::dbgs() << "Not outlining in " << F.getName() << "\n");
432       continue;
433     }
434 
435     LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
436     DominatorTree DT(F);
437     PostDomTree PDT(F);
438     PDT.recalculate(F);
439     BlockFrequencyInfo *BFI = GetBFI(F);
440 
441     BlockSequence ColdRegion = getLargestColdRegion(F, *PSI, BFI, DT, PDT);
442     if (ColdRegion.empty())
443       continue;
444 
445     OptimizationRemarkEmitter &ORE = (*GetORE)(F);
446     Function *Outlined =
447         extractColdRegion(ColdRegion, DT, BFI, ORE, /*Count=*/1);
448     if (Outlined) {
449       OutlinedFunctions.insert(Outlined);
450       Changed = true;
451     }
452   }
453   return Changed;
454 }
455 
456 bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
457   if (skipModule(M))
458     return false;
459   ProfileSummaryInfo *PSI =
460       getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
461   auto GTTI = [this](Function &F) -> TargetTransformInfo & {
462     return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
463   };
464   auto GBFI = [this](Function &F) {
465     return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
466   };
467   std::unique_ptr<OptimizationRemarkEmitter> ORE;
468   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
469       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
470     ORE.reset(new OptimizationRemarkEmitter(&F));
471     return *ORE.get();
472   };
473 
474   return HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M);
475 }
476 
477 PreservedAnalyses
478 HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
479   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
480 
481   std::function<AssumptionCache &(Function &)> GetAssumptionCache =
482       [&FAM](Function &F) -> AssumptionCache & {
483     return FAM.getResult<AssumptionAnalysis>(F);
484   };
485 
486   auto GBFI = [&FAM](Function &F) {
487     return &FAM.getResult<BlockFrequencyAnalysis>(F);
488   };
489 
490   std::function<TargetTransformInfo &(Function &)> GTTI =
491       [&FAM](Function &F) -> TargetTransformInfo & {
492     return FAM.getResult<TargetIRAnalysis>(F);
493   };
494 
495   std::unique_ptr<OptimizationRemarkEmitter> ORE;
496   std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
497       [&ORE](Function &F) -> OptimizationRemarkEmitter & {
498     ORE.reset(new OptimizationRemarkEmitter(&F));
499     return *ORE.get();
500   };
501 
502   ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
503 
504   if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE).run(M))
505     return PreservedAnalyses::none();
506   return PreservedAnalyses::all();
507 }
508 
509 char HotColdSplittingLegacyPass::ID = 0;
510 INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
511                       "Hot Cold Splitting", false, false)
512 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
513 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
514 INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
515                     "Hot Cold Splitting", false, false)
516 
517 ModulePass *llvm::createHotColdSplittingPass() {
518   return new HotColdSplittingLegacyPass();
519 }
520