1 //===- bolt/Passes/TailDuplication.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the TailDuplication class.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "bolt/Passes/TailDuplication.h"
14 #include "llvm/ADT/DenseMap.h"
15 #include "llvm/MC/MCRegisterInfo.h"
16
17 #include <numeric>
18
19 #define DEBUG_TYPE "taildup"
20
21 using namespace llvm;
22
23 namespace opts {
24
25 extern cl::OptionCategory BoltOptCategory;
26 extern cl::opt<bool> NoThreads;
27
28 cl::opt<bolt::TailDuplication::DuplicationMode> TailDuplicationMode(
29 "tail-duplication",
30 cl::desc("duplicate unconditional branches that cross a cache line"),
31 cl::init(bolt::TailDuplication::TD_NONE),
32 cl::values(clEnumValN(bolt::TailDuplication::TD_NONE, "none",
33 "do not apply"),
34 clEnumValN(bolt::TailDuplication::TD_AGGRESSIVE, "aggressive",
35 "aggressive strategy"),
36 clEnumValN(bolt::TailDuplication::TD_MODERATE, "moderate",
37 "moderate strategy"),
38 clEnumValN(bolt::TailDuplication::TD_CACHE, "cache",
39 "cache-aware duplication strategy")),
40 cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
41
42 static cl::opt<unsigned>
43 TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
44 cl::desc("minimum offset needed between block "
45 "and successor to allow duplication"),
46 cl::ReallyHidden, cl::init(64),
47 cl::cat(BoltOptCategory));
48
49 static cl::opt<unsigned> TailDuplicationMaximumDuplication(
50 "tail-duplication-maximum-duplication",
51 cl::desc("tail blocks whose size (in bytes) exceeds the value are never "
52 "duplicated"),
53 cl::ZeroOrMore, cl::ReallyHidden, cl::init(24), cl::cat(BoltOptCategory));
54
55 static cl::opt<unsigned> TailDuplicationMinimumDuplication(
56 "tail-duplication-minimum-duplication",
57 cl::desc("tail blocks with size (in bytes) not exceeding the value are "
58 "always duplicated"),
59 cl::ReallyHidden, cl::init(2), cl::cat(BoltOptCategory));
60
61 static cl::opt<bool> TailDuplicationConstCopyPropagation(
62 "tail-duplication-const-copy-propagation",
63 cl::desc("enable const and copy propagation after tail duplication"),
64 cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));
65
66 static cl::opt<unsigned> TailDuplicationMaxCacheDistance(
67 "tail-duplication-max-cache-distance",
68 cl::desc("The weight of backward jumps for ExtTSP value"), cl::init(256),
69 cl::ReallyHidden, cl::cat(BoltOptCategory));
70
71 static cl::opt<double> TailDuplicationCacheBackwardWeight(
72 "tail-duplication-cache-backward-weight",
73 cl::desc(
74 "The maximum distance (in bytes) of backward jumps for ExtTSP value"),
75 cl::init(0.5), cl::ReallyHidden, cl::cat(BoltOptCategory));
76
77 } // namespace opts
78
79 namespace llvm {
80 namespace bolt {
81
getCallerSavedRegs(const MCInst & Inst,BitVector & Regs,BinaryContext & BC) const82 void TailDuplication::getCallerSavedRegs(const MCInst &Inst, BitVector &Regs,
83 BinaryContext &BC) const {
84 if (!BC.MIB->isCall(Inst))
85 return;
86 BitVector CallRegs = BitVector(BC.MRI->getNumRegs(), false);
87 BC.MIB->getCalleeSavedRegs(CallRegs);
88 CallRegs.flip();
89 Regs |= CallRegs;
90 }
91
regIsPossiblyOverwritten(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const92 bool TailDuplication::regIsPossiblyOverwritten(const MCInst &Inst, unsigned Reg,
93 BinaryContext &BC) const {
94 BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false);
95 BC.MIB->getWrittenRegs(Inst, WrittenRegs);
96 getCallerSavedRegs(Inst, WrittenRegs, BC);
97 if (BC.MIB->isRep(Inst))
98 BC.MIB->getRepRegs(WrittenRegs);
99 WrittenRegs &= BC.MIB->getAliases(Reg, false);
100 return WrittenRegs.any();
101 }
102
regIsDefinitelyOverwritten(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const103 bool TailDuplication::regIsDefinitelyOverwritten(const MCInst &Inst,
104 unsigned Reg,
105 BinaryContext &BC) const {
106 BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false);
107 BC.MIB->getWrittenRegs(Inst, WrittenRegs);
108 getCallerSavedRegs(Inst, WrittenRegs, BC);
109 if (BC.MIB->isRep(Inst))
110 BC.MIB->getRepRegs(WrittenRegs);
111 return (!regIsUsed(Inst, Reg, BC) && WrittenRegs.test(Reg) &&
112 !BC.MIB->isConditionalMove(Inst));
113 }
114
regIsUsed(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const115 bool TailDuplication::regIsUsed(const MCInst &Inst, unsigned Reg,
116 BinaryContext &BC) const {
117 BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false);
118 BC.MIB->getSrcRegs(Inst, SrcRegs);
119 SrcRegs &= BC.MIB->getAliases(Reg, true);
120 return SrcRegs.any();
121 }
122
isOverwrittenBeforeUsed(BinaryBasicBlock & StartBB,unsigned Reg) const123 bool TailDuplication::isOverwrittenBeforeUsed(BinaryBasicBlock &StartBB,
124 unsigned Reg) const {
125 BinaryFunction *BF = StartBB.getFunction();
126 BinaryContext &BC = BF->getBinaryContext();
127 std::queue<BinaryBasicBlock *> Q;
128 for (auto Itr = StartBB.succ_begin(); Itr != StartBB.succ_end(); ++Itr) {
129 BinaryBasicBlock *NextBB = *Itr;
130 Q.push(NextBB);
131 }
132 std::set<BinaryBasicBlock *> Visited;
133 // Breadth first search through successive blocks and see if Reg is ever used
134 // before its overwritten
135 while (Q.size() > 0) {
136 BinaryBasicBlock *CurrBB = Q.front();
137 Q.pop();
138 if (Visited.count(CurrBB))
139 continue;
140 Visited.insert(CurrBB);
141 bool Overwritten = false;
142 for (auto Itr = CurrBB->begin(); Itr != CurrBB->end(); ++Itr) {
143 MCInst &Inst = *Itr;
144 if (regIsUsed(Inst, Reg, BC))
145 return false;
146 if (regIsDefinitelyOverwritten(Inst, Reg, BC)) {
147 Overwritten = true;
148 break;
149 }
150 }
151 if (Overwritten)
152 continue;
153 for (auto Itr = CurrBB->succ_begin(); Itr != CurrBB->succ_end(); ++Itr) {
154 BinaryBasicBlock *NextBB = *Itr;
155 Q.push(NextBB);
156 }
157 }
158 return true;
159 }
160
constantAndCopyPropagate(BinaryBasicBlock & OriginalBB,std::vector<BinaryBasicBlock * > & BlocksToPropagate)161 void TailDuplication::constantAndCopyPropagate(
162 BinaryBasicBlock &OriginalBB,
163 std::vector<BinaryBasicBlock *> &BlocksToPropagate) {
164 BinaryFunction *BF = OriginalBB.getFunction();
165 BinaryContext &BC = BF->getBinaryContext();
166
167 BlocksToPropagate.insert(BlocksToPropagate.begin(), &OriginalBB);
168 // Iterate through the original instructions to find one to propagate
169 for (auto Itr = OriginalBB.begin(); Itr != OriginalBB.end(); ++Itr) {
170 MCInst &OriginalInst = *Itr;
171 // It must be a non conditional
172 if (BC.MIB->isConditionalMove(OriginalInst))
173 continue;
174
175 // Move immediate or move register
176 if ((!BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate() ||
177 !OriginalInst.getOperand(1).isImm()) &&
178 (!BC.MII->get(OriginalInst.getOpcode()).isMoveReg() ||
179 !OriginalInst.getOperand(1).isReg()))
180 continue;
181
182 // True if this is constant propagation and not copy propagation
183 bool ConstantProp = BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate();
184 // The Register to replaced
185 unsigned Reg = OriginalInst.getOperand(0).getReg();
186 // True if the register to replace was replaced everywhere it was used
187 bool ReplacedEverywhere = true;
188 // True if the register was definitely overwritten
189 bool Overwritten = false;
190 // True if the register to replace and the register to replace with (for
191 // copy propagation) has not been overwritten and is still usable
192 bool RegsActive = true;
193
194 // Iterate through successor blocks and through their instructions
195 for (BinaryBasicBlock *NextBB : BlocksToPropagate) {
196 for (auto PropagateItr =
197 ((NextBB == &OriginalBB) ? Itr + 1 : NextBB->begin());
198 PropagateItr < NextBB->end(); ++PropagateItr) {
199 MCInst &PropagateInst = *PropagateItr;
200 if (regIsUsed(PropagateInst, Reg, BC)) {
201 bool Replaced = false;
202 // If both registers are active for copy propagation or the register
203 // to replace is active for constant propagation
204 if (RegsActive) {
205 // Set Replaced and so ReplacedEverwhere to false if it cannot be
206 // replaced (no replacing that opcode, Register is src and dest)
207 if (ConstantProp)
208 Replaced = BC.MIB->replaceRegWithImm(
209 PropagateInst, Reg, OriginalInst.getOperand(1).getImm());
210 else
211 Replaced = BC.MIB->replaceRegWithReg(
212 PropagateInst, Reg, OriginalInst.getOperand(1).getReg());
213 }
214 ReplacedEverywhere = ReplacedEverywhere && Replaced;
215 }
216 // For copy propagation, make sure no propagation happens after the
217 // register to replace with is overwritten
218 if (!ConstantProp &&
219 regIsPossiblyOverwritten(PropagateInst,
220 OriginalInst.getOperand(1).getReg(), BC))
221 RegsActive = false;
222
223 // Make sure no propagation happens after the register to replace is
224 // overwritten
225 if (regIsPossiblyOverwritten(PropagateInst, Reg, BC))
226 RegsActive = false;
227
228 // Record if the register to replace is overwritten
229 if (regIsDefinitelyOverwritten(PropagateInst, Reg, BC)) {
230 Overwritten = true;
231 break;
232 }
233 }
234 if (Overwritten)
235 break;
236 }
237
238 // If the register was replaced everwhere and it was overwritten in either
239 // one of the iterated through blocks or one of the successor blocks, delete
240 // the original move instruction
241 if (ReplacedEverywhere &&
242 (Overwritten ||
243 isOverwrittenBeforeUsed(
244 *BlocksToPropagate[BlocksToPropagate.size() - 1], Reg))) {
245 // If both registers are active for copy propagation or the register
246 // to replace is active for constant propagation
247 StaticInstructionDeletionCount++;
248 DynamicInstructionDeletionCount += OriginalBB.getExecutionCount();
249 Itr = std::prev(OriginalBB.eraseInstruction(Itr));
250 }
251 }
252 }
253
isInCacheLine(const BinaryBasicBlock & BB,const BinaryBasicBlock & Succ) const254 bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB,
255 const BinaryBasicBlock &Succ) const {
256 if (&BB == &Succ)
257 return true;
258
259 uint64_t Distance = 0;
260 int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1;
261
262 for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex();
263 I += Direction) {
264 Distance += BB.getFunction()->getLayout().getBlock(I)->getOriginalSize();
265 if (Distance > opts::TailDuplicationMinimumOffset)
266 return false;
267 }
268 return true;
269 }
270
271 std::vector<BinaryBasicBlock *>
moderateDuplicate(BinaryBasicBlock & BB,BinaryBasicBlock & Tail) const272 TailDuplication::moderateDuplicate(BinaryBasicBlock &BB,
273 BinaryBasicBlock &Tail) const {
274 std::vector<BinaryBasicBlock *> BlocksToDuplicate;
275 // The block must be hot
276 if (BB.getKnownExecutionCount() == 0)
277 return BlocksToDuplicate;
278 // and its sucessor is not already in the same cache line
279 if (isInCacheLine(BB, Tail))
280 return BlocksToDuplicate;
281 // and its size do not exceed the maximum allowed size
282 if (Tail.getOriginalSize() > opts::TailDuplicationMaximumDuplication)
283 return BlocksToDuplicate;
284 // If duplicating would introduce a new branch, don't duplicate
285 for (auto Itr = Tail.succ_begin(); Itr != Tail.succ_end(); ++Itr) {
286 if ((*Itr)->getLayoutIndex() == Tail.getLayoutIndex() + 1)
287 return BlocksToDuplicate;
288 }
289
290 BlocksToDuplicate.push_back(&Tail);
291 return BlocksToDuplicate;
292 }
293
294 std::vector<BinaryBasicBlock *>
aggressiveDuplicate(BinaryBasicBlock & BB,BinaryBasicBlock & Tail) const295 TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
296 BinaryBasicBlock &Tail) const {
297 std::vector<BinaryBasicBlock *> BlocksToDuplicate;
298 // The block must be hot
299 if (BB.getKnownExecutionCount() == 0)
300 return BlocksToDuplicate;
301 // and its sucessor is not already in the same cache line
302 if (isInCacheLine(BB, Tail))
303 return BlocksToDuplicate;
304
305 BinaryBasicBlock *CurrBB = &BB;
306 while (CurrBB) {
307 LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
308 << CurrBB->getName() << " to duplication list\n";);
309 BlocksToDuplicate.push_back(CurrBB);
310
311 if (CurrBB->hasJumpTable()) {
312 LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing duplication "
313 "list due to a JT in "
314 << CurrBB->getName() << '\n';);
315 BlocksToDuplicate.clear();
316 break;
317 }
318
319 // With no successors, we've reached the end and should duplicate all of
320 // BlocksToDuplicate
321 if (CurrBB->succ_size() == 0)
322 break;
323
324 // With two successors, if they're both a jump, we should duplicate all
325 // blocks in BlocksToDuplicate. Otherwise, we cannot find a simple stream of
326 // blocks to copy
327 if (CurrBB->succ_size() >= 2) {
328 if (CurrBB->getConditionalSuccessor(false)->getLayoutIndex() ==
329 CurrBB->getLayoutIndex() + 1 ||
330 CurrBB->getConditionalSuccessor(true)->getLayoutIndex() ==
331 CurrBB->getLayoutIndex() + 1) {
332 LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing "
333 "duplication list, can't find a simple stream at "
334 << CurrBB->getName() << '\n';);
335 BlocksToDuplicate.clear();
336 }
337 break;
338 }
339
340 // With one successor, if its a jump, we should duplicate all blocks in
341 // BlocksToDuplicate. Otherwise, we should keep going
342 BinaryBasicBlock *SuccBB = CurrBB->getSuccessor();
343 if (SuccBB->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
344 break;
345 CurrBB = SuccBB;
346 }
347 // Don't duplicate if its too much code
348 unsigned DuplicationByteCount = std::accumulate(
349 std::begin(BlocksToDuplicate), std::end(BlocksToDuplicate), 0,
350 [](int value, BinaryBasicBlock *p) {
351 return value + p->getOriginalSize();
352 });
353 if (DuplicationByteCount > opts::TailDuplicationMaximumDuplication) {
354 LLVM_DEBUG(dbgs() << "Aggressive tail duplication: duplication byte count ("
355 << DuplicationByteCount << ") exceeds maximum "
356 << opts::TailDuplicationMaximumDuplication << '\n';);
357 BlocksToDuplicate.clear();
358 }
359 LLVM_DEBUG(dbgs() << "Aggressive tail duplication: found "
360 << BlocksToDuplicate.size() << " blocks to duplicate\n";);
361 return BlocksToDuplicate;
362 }
363
shouldDuplicate(BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const364 bool TailDuplication::shouldDuplicate(BinaryBasicBlock *Pred,
365 BinaryBasicBlock *Tail) const {
366 if (Pred == Tail)
367 return false;
368 // Cannot duplicate non-tail blocks
369 if (Tail->succ_size() != 0)
370 return false;
371 // The blocks are already in the order
372 if (Pred->getLayoutIndex() + 1 == Tail->getLayoutIndex())
373 return false;
374 // No tail duplication for blocks with jump tables
375 if (Pred->hasJumpTable())
376 return false;
377 if (Tail->hasJumpTable())
378 return false;
379
380 return true;
381 }
382
cacheScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t DstSize,uint64_t Count) const383 double TailDuplication::cacheScore(uint64_t SrcAddr, uint64_t SrcSize,
384 uint64_t DstAddr, uint64_t DstSize,
385 uint64_t Count) const {
386 assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
387
388 bool IsForwardJump = SrcAddr <= DstAddr;
389 uint64_t JumpDistance = 0;
390 // Computing the length of the jump so that it takes the sizes of the two
391 // blocks into consideration
392 if (IsForwardJump) {
393 JumpDistance = (DstAddr + DstSize) - (SrcAddr);
394 } else {
395 JumpDistance = (SrcAddr + SrcSize) - (DstAddr);
396 }
397
398 if (JumpDistance >= opts::TailDuplicationMaxCacheDistance)
399 return 0;
400 double Prob = 1.0 - static_cast<double>(JumpDistance) /
401 opts::TailDuplicationMaxCacheDistance;
402 return (IsForwardJump ? 1.0 : opts::TailDuplicationCacheBackwardWeight) *
403 Prob * Count;
404 }
405
cacheScoreImproved(const MCCodeEmitter * Emitter,BinaryFunction & BF,BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const406 bool TailDuplication::cacheScoreImproved(const MCCodeEmitter *Emitter,
407 BinaryFunction &BF,
408 BinaryBasicBlock *Pred,
409 BinaryBasicBlock *Tail) const {
410 // Collect (estimated) basic block sizes
411 DenseMap<const BinaryBasicBlock *, uint64_t> BBSize;
412 for (const BinaryBasicBlock &BB : BF) {
413 BBSize[&BB] = std::max<uint64_t>(BB.estimateSize(Emitter), 1);
414 }
415
416 // Build current addresses of basic blocks starting at the entry block
417 DenseMap<BinaryBasicBlock *, uint64_t> CurAddr;
418 uint64_t Addr = 0;
419 for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
420 CurAddr[SrcBB] = Addr;
421 Addr += BBSize[SrcBB];
422 }
423
424 // Build new addresses (after duplication) starting at the entry block
425 DenseMap<BinaryBasicBlock *, uint64_t> NewAddr;
426 Addr = 0;
427 for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
428 NewAddr[SrcBB] = Addr;
429 Addr += BBSize[SrcBB];
430 if (SrcBB == Pred)
431 Addr += BBSize[Tail];
432 }
433
434 // Compute the cache score for the existing layout of basic blocks
435 double CurScore = 0;
436 for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
437 auto BI = SrcBB->branch_info_begin();
438 for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
439 if (SrcBB != DstBB) {
440 CurScore += cacheScore(CurAddr[SrcBB], BBSize[SrcBB], CurAddr[DstBB],
441 BBSize[DstBB], BI->Count);
442 }
443 ++BI;
444 }
445 }
446
447 // Compute the cache score for the layout of blocks after tail duplication
448 double NewScore = 0;
449 for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
450 auto BI = SrcBB->branch_info_begin();
451 for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
452 if (SrcBB != DstBB) {
453 if (SrcBB == Pred && DstBB == Tail) {
454 NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB],
455 NewAddr[SrcBB] + BBSize[SrcBB], BBSize[DstBB],
456 BI->Count);
457 } else {
458 NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB], NewAddr[DstBB],
459 BBSize[DstBB], BI->Count);
460 }
461 }
462 ++BI;
463 }
464 }
465
466 return NewScore > CurScore;
467 }
468
469 std::vector<BinaryBasicBlock *>
cacheDuplicate(const MCCodeEmitter * Emitter,BinaryFunction & BF,BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const470 TailDuplication::cacheDuplicate(const MCCodeEmitter *Emitter,
471 BinaryFunction &BF, BinaryBasicBlock *Pred,
472 BinaryBasicBlock *Tail) const {
473 std::vector<BinaryBasicBlock *> BlocksToDuplicate;
474
475 // No need to duplicate cold basic blocks
476 if (Pred->isCold() || Tail->isCold()) {
477 return BlocksToDuplicate;
478 }
479 // Always duplicate "small" tail basic blocks, which might be beneficial for
480 // code size, since a jump instruction is eliminated
481 if (Tail->estimateSize(Emitter) <= opts::TailDuplicationMinimumDuplication) {
482 BlocksToDuplicate.push_back(Tail);
483 return BlocksToDuplicate;
484 }
485 // Never duplicate "large" tail basic blocks
486 if (Tail->estimateSize(Emitter) > opts::TailDuplicationMaximumDuplication) {
487 return BlocksToDuplicate;
488 }
489 // Do not append basic blocks after the last hot block in the current layout
490 auto NextBlock = BF.getLayout().getBasicBlockAfter(Pred);
491 if (NextBlock == nullptr || (!Pred->isCold() && NextBlock->isCold())) {
492 return BlocksToDuplicate;
493 }
494
495 // Duplicate the tail only if it improves the cache score
496 if (cacheScoreImproved(Emitter, BF, Pred, Tail)) {
497 BlocksToDuplicate.push_back(Tail);
498 }
499
500 return BlocksToDuplicate;
501 }
502
duplicateBlocks(BinaryBasicBlock & BB,const std::vector<BinaryBasicBlock * > & BlocksToDuplicate) const503 std::vector<BinaryBasicBlock *> TailDuplication::duplicateBlocks(
504 BinaryBasicBlock &BB,
505 const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const {
506 BinaryFunction *BF = BB.getFunction();
507 BinaryContext &BC = BF->getBinaryContext();
508
509 // Ratio of this new branches execution count to the total size of the
510 // successor's execution count. Used to set this new branches execution count
511 // and lower the old successor's execution count
512 double ExecutionCountRatio =
513 BB.getExecutionCount() >= BB.getSuccessor()->getExecutionCount()
514 ? 1.0
515 : (double)BB.getExecutionCount() /
516 BB.getSuccessor()->getExecutionCount();
517
518 // Use the last branch info when adding a successor to LastBB
519 BinaryBasicBlock::BinaryBranchInfo &LastBI =
520 BB.getBranchInfo(*(BB.getSuccessor()));
521
522 BinaryBasicBlock *LastOriginalBB = &BB;
523 BinaryBasicBlock *LastDuplicatedBB = &BB;
524 assert(LastDuplicatedBB->succ_size() == 1 &&
525 "tail duplication cannot act on a block with more than 1 successor");
526 LastDuplicatedBB->removeSuccessor(LastDuplicatedBB->getSuccessor());
527
528 std::vector<std::unique_ptr<BinaryBasicBlock>> DuplicatedBlocks;
529 std::vector<BinaryBasicBlock *> DuplicatedBlocksToReturn;
530
531 for (BinaryBasicBlock *CurBB : BlocksToDuplicate) {
532 DuplicatedBlocks.emplace_back(
533 BF->createBasicBlock((BC.Ctx)->createNamedTempSymbol("tail-dup")));
534 BinaryBasicBlock *NewBB = DuplicatedBlocks.back().get();
535
536 NewBB->addInstructions(CurBB->begin(), CurBB->end());
537 // Set execution count as if it was just a copy of the original
538 NewBB->setExecutionCount(CurBB->getExecutionCount());
539 NewBB->setIsCold(CurBB->isCold());
540 LastDuplicatedBB->addSuccessor(NewBB, LastBI);
541
542 DuplicatedBlocksToReturn.push_back(NewBB);
543
544 // As long as its not the first block, adjust both original and duplicated
545 // to what they should be
546 if (LastDuplicatedBB != &BB) {
547 LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
548 LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
549 }
550
551 if (CurBB->succ_size() == 1)
552 LastBI = CurBB->getBranchInfo(*(CurBB->getSuccessor()));
553
554 LastOriginalBB = CurBB;
555 LastDuplicatedBB = NewBB;
556 }
557
558 LastDuplicatedBB->addSuccessors(
559 LastOriginalBB->succ_begin(), LastOriginalBB->succ_end(),
560 LastOriginalBB->branch_info_begin(), LastOriginalBB->branch_info_end());
561
562 LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
563 LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
564
565 BF->insertBasicBlocks(&BB, std::move(DuplicatedBlocks));
566
567 return DuplicatedBlocksToReturn;
568 }
569
runOnFunction(BinaryFunction & Function)570 void TailDuplication::runOnFunction(BinaryFunction &Function) {
571 // Create a separate MCCodeEmitter to allow lock-free execution
572 BinaryContext::IndependentCodeEmitter Emitter;
573 if (!opts::NoThreads) {
574 Emitter = Function.getBinaryContext().createIndependentMCCodeEmitter();
575 }
576
577 Function.getLayout().updateLayoutIndices();
578
579 // New blocks will be added and layout will change,
580 // so make a copy here to iterate over the original layout
581 BinaryFunction::BasicBlockOrderType BlockLayout(
582 Function.getLayout().block_begin(), Function.getLayout().block_end());
583 bool ModifiedFunction = false;
584 for (BinaryBasicBlock *BB : BlockLayout) {
585 AllDynamicCount += BB->getKnownExecutionCount();
586
587 // The block must be with one successor
588 if (BB->succ_size() != 1)
589 continue;
590 BinaryBasicBlock *Tail = BB->getSuccessor();
591 // Verify that the tail should be duplicated
592 if (!shouldDuplicate(BB, Tail))
593 continue;
594
595 std::vector<BinaryBasicBlock *> BlocksToDuplicate;
596 if (opts::TailDuplicationMode == TailDuplication::TD_AGGRESSIVE) {
597 BlocksToDuplicate = aggressiveDuplicate(*BB, *Tail);
598 } else if (opts::TailDuplicationMode == TailDuplication::TD_MODERATE) {
599 BlocksToDuplicate = moderateDuplicate(*BB, *Tail);
600 } else if (opts::TailDuplicationMode == TailDuplication::TD_CACHE) {
601 BlocksToDuplicate = cacheDuplicate(Emitter.MCE.get(), Function, BB, Tail);
602 } else {
603 llvm_unreachable("unknown tail duplication mode");
604 }
605
606 if (BlocksToDuplicate.empty())
607 continue;
608
609 // Apply the the duplication
610 ModifiedFunction = true;
611 DuplicationsDynamicCount += BB->getExecutionCount();
612 auto DuplicatedBlocks = duplicateBlocks(*BB, BlocksToDuplicate);
613 for (BinaryBasicBlock *BB : DuplicatedBlocks) {
614 DuplicatedBlockCount++;
615 DuplicatedByteCount += BB->estimateSize(Emitter.MCE.get());
616 }
617
618 if (opts::TailDuplicationConstCopyPropagation) {
619 constantAndCopyPropagate(*BB, DuplicatedBlocks);
620 BinaryBasicBlock *FirstBB = BlocksToDuplicate[0];
621 if (FirstBB->pred_size() == 1) {
622 BinaryBasicBlock *PredBB = *FirstBB->pred_begin();
623 if (PredBB->succ_size() == 1)
624 constantAndCopyPropagate(*PredBB, BlocksToDuplicate);
625 }
626 }
627
628 // Layout indices might be stale after duplication
629 Function.getLayout().updateLayoutIndices();
630 }
631 if (ModifiedFunction)
632 ModifiedFunctions++;
633 }
634
runOnFunctions(BinaryContext & BC)635 void TailDuplication::runOnFunctions(BinaryContext &BC) {
636 if (opts::TailDuplicationMode == TailDuplication::TD_NONE)
637 return;
638
639 for (auto &It : BC.getBinaryFunctions()) {
640 BinaryFunction &Function = It.second;
641 if (!shouldOptimize(Function))
642 continue;
643 runOnFunction(Function);
644 }
645
646 outs() << "BOLT-INFO: tail duplication"
647 << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
648 100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
649 << format(" duplicated %zu blocks (%zu bytes) responsible for",
650 DuplicatedBlockCount, DuplicatedByteCount)
651 << format(" %zu dynamic executions (%.2f%% of all block executions)",
652 DuplicationsDynamicCount,
653 100.0 * DuplicationsDynamicCount / AllDynamicCount)
654 << "\n";
655
656 if (opts::TailDuplicationConstCopyPropagation) {
657 outs() << "BOLT-INFO: tail duplication "
658 << format("applied %zu static and %zu dynamic propagation deletions",
659 StaticInstructionDeletionCount,
660 DynamicInstructionDeletionCount)
661 << "\n";
662 }
663 }
664
665 } // end namespace bolt
666 } // end namespace llvm
667