1 //===- bolt/Passes/Aligner.cpp - Pass for optimal code alignment ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AlignerPass class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/Aligner.h"
14 #include "bolt/Core/ParallelUtilities.h"
15 
16 #define DEBUG_TYPE "bolt-aligner"
17 
18 using namespace llvm;
19 
20 namespace opts {
21 
22 extern cl::OptionCategory BoltOptCategory;
23 
24 extern cl::opt<bool> AlignBlocks;
25 extern cl::opt<bool> PreserveBlocksAlignment;
26 extern cl::opt<unsigned> AlignFunctions;
27 
28 cl::opt<unsigned>
29 AlignBlocksMinSize("align-blocks-min-size",
30   cl::desc("minimal size of the basic block that should be aligned"),
31   cl::init(0),
32   cl::ZeroOrMore,
33   cl::Hidden,
34   cl::cat(BoltOptCategory));
35 
36 cl::opt<unsigned>
37 AlignBlocksThreshold("align-blocks-threshold",
38   cl::desc("align only blocks with frequency larger than containing function "
39            "execution frequency specified in percent. E.g. 1000 means aligning "
40            "blocks that are 10 times more frequently executed than the "
41            "containing function."),
42   cl::init(800),
43   cl::ZeroOrMore,
44   cl::Hidden,
45   cl::cat(BoltOptCategory));
46 
47 cl::opt<unsigned>
48 AlignFunctionsMaxBytes("align-functions-max-bytes",
49   cl::desc("maximum number of bytes to use to align functions"),
50   cl::init(32),
51   cl::ZeroOrMore,
52   cl::cat(BoltOptCategory));
53 
54 cl::opt<unsigned>
55 BlockAlignment("block-alignment",
56   cl::desc("boundary to use for alignment of basic blocks"),
57   cl::init(16),
58   cl::ZeroOrMore,
59   cl::cat(BoltOptCategory));
60 
61 cl::opt<bool>
62 UseCompactAligner("use-compact-aligner",
63   cl::desc("Use compact approach for aligning functions"),
64   cl::init(true),
65   cl::ZeroOrMore,
66   cl::cat(BoltOptCategory));
67 
68 } // end namespace opts
69 
70 namespace llvm {
71 namespace bolt {
72 
73 namespace {
74 
75 // Align function to the specified byte-boundary (typically, 64) offsetting
76 // the fuction by not more than the corresponding value
77 void alignMaxBytes(BinaryFunction &Function) {
78   Function.setAlignment(opts::AlignFunctions);
79   Function.setMaxAlignmentBytes(opts::AlignFunctionsMaxBytes);
80   Function.setMaxColdAlignmentBytes(opts::AlignFunctionsMaxBytes);
81 }
82 
83 // Align function to the specified byte-boundary (typically, 64) offsetting
84 // the fuction by not more than the minimum over
85 // -- the size of the function
86 // -- the specified number of bytes
87 void alignCompact(BinaryFunction &Function, const MCCodeEmitter *Emitter) {
88   const BinaryContext &BC = Function.getBinaryContext();
89   size_t HotSize = 0;
90   size_t ColdSize = 0;
91 
92   for (const BinaryBasicBlock *BB : Function.layout())
93     if (BB->isCold())
94       ColdSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
95     else
96       HotSize += BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
97 
98   Function.setAlignment(opts::AlignFunctions);
99   if (HotSize > 0)
100     Function.setMaxAlignmentBytes(
101       std::min(size_t(opts::AlignFunctionsMaxBytes), HotSize));
102 
103   // using the same option, max-align-bytes, both for cold and hot parts of the
104   // functions, as aligning cold functions typically does not affect performance
105   if (ColdSize > 0)
106     Function.setMaxColdAlignmentBytes(
107       std::min(size_t(opts::AlignFunctionsMaxBytes), ColdSize));
108 }
109 
110 } // end anonymous namespace
111 
112 void AlignerPass::alignBlocks(BinaryFunction &Function,
113                               const MCCodeEmitter *Emitter) {
114   if (!Function.hasValidProfile() || !Function.isSimple())
115     return;
116 
117   const BinaryContext &BC = Function.getBinaryContext();
118 
119   const uint64_t FuncCount =
120       std::max<uint64_t>(1, Function.getKnownExecutionCount());
121   BinaryBasicBlock *PrevBB = nullptr;
122   for (BinaryBasicBlock *BB : Function.layout()) {
123     uint64_t Count = BB->getKnownExecutionCount();
124 
125     if (Count <= FuncCount * opts::AlignBlocksThreshold / 100) {
126       PrevBB = BB;
127       continue;
128     }
129 
130     uint64_t FTCount = 0;
131     if (PrevBB && PrevBB->getFallthrough() == BB)
132       FTCount = PrevBB->getBranchInfo(*BB).Count;
133 
134     PrevBB = BB;
135 
136     if (Count < FTCount * 2)
137       continue;
138 
139     const uint64_t BlockSize =
140         BC.computeCodeSize(BB->begin(), BB->end(), Emitter);
141     const uint64_t BytesToUse =
142         std::min<uint64_t>(opts::BlockAlignment - 1, BlockSize);
143 
144     if (opts::AlignBlocksMinSize && BlockSize < opts::AlignBlocksMinSize)
145       continue;
146 
147     BB->setAlignment(opts::BlockAlignment);
148     BB->setAlignmentMaxBytes(BytesToUse);
149 
150     // Update stats.
151     LLVM_DEBUG(
152       std::unique_lock<std::shared_timed_mutex> Lock(AlignHistogramMtx);
153       AlignHistogram[BytesToUse]++;
154       AlignedBlocksCount += BB->getKnownExecutionCount();
155     );
156   }
157 }
158 
159 void AlignerPass::runOnFunctions(BinaryContext &BC) {
160   if (!BC.HasRelocations)
161     return;
162 
163   AlignHistogram.resize(opts::BlockAlignment);
164 
165   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
166     // Create a separate MCCodeEmitter to allow lock free execution
167     BinaryContext::IndependentCodeEmitter Emitter =
168         BC.createIndependentMCCodeEmitter();
169 
170     if (opts::UseCompactAligner)
171       alignCompact(BF, Emitter.MCE.get());
172     else
173       alignMaxBytes(BF);
174 
175     if (opts::AlignBlocks && !opts::PreserveBlocksAlignment)
176       alignBlocks(BF, Emitter.MCE.get());
177   };
178 
179   ParallelUtilities::runOnEachFunction(
180       BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun,
181       ParallelUtilities::PredicateTy(nullptr), "AlignerPass");
182 
183   LLVM_DEBUG(
184     dbgs() << "BOLT-DEBUG: max bytes per basic block alignment distribution:\n";
185     for (unsigned I = 1; I < AlignHistogram.size(); ++I)
186       dbgs() << "  " << I << " : " << AlignHistogram[I] << '\n';
187 
188     dbgs() << "BOLT-DEBUG: total execution count of aligned blocks: "
189            << AlignedBlocksCount << '\n';
190   );
191 }
192 
193 } // end namespace bolt
194 } // end namespace llvm
195