1 //===- bolt/Passes/TailDuplication.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the TailDuplication class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/TailDuplication.h"
14 #include "llvm/ADT/DenseMap.h"
15 #include "llvm/MC/MCRegisterInfo.h"
16 
17 #include <numeric>
18 
19 #define DEBUG_TYPE "taildup"
20 
21 using namespace llvm;
22 
23 namespace opts {
24 
25 extern cl::OptionCategory BoltOptCategory;
26 extern cl::opt<bool> NoThreads;
27 
28 cl::opt<bolt::TailDuplication::DuplicationMode> TailDuplicationMode(
29     "tail-duplication",
30     cl::desc("duplicate unconditional branches that cross a cache line"),
31     cl::init(bolt::TailDuplication::TD_NONE),
32     cl::values(clEnumValN(bolt::TailDuplication::TD_NONE, "none",
33                           "do not apply"),
34                clEnumValN(bolt::TailDuplication::TD_AGGRESSIVE, "aggressive",
35                           "aggressive strategy"),
36                clEnumValN(bolt::TailDuplication::TD_MODERATE, "moderate",
37                           "moderate strategy"),
38                clEnumValN(bolt::TailDuplication::TD_CACHE, "cache",
39                           "cache-aware duplication strategy")),
40     cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
41 
42 static cl::opt<unsigned>
43     TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
44                                  cl::desc("minimum offset needed between block "
45                                           "and successor to allow duplication"),
46                                  cl::ReallyHidden, cl::init(64),
47                                  cl::cat(BoltOptCategory));
48 
49 static cl::opt<unsigned> TailDuplicationMaximumDuplication(
50     "tail-duplication-maximum-duplication",
51     cl::desc("tail blocks whose size (in bytes) exceeds the value are never "
52              "duplicated"),
53     cl::ZeroOrMore, cl::ReallyHidden, cl::init(24), cl::cat(BoltOptCategory));
54 
55 static cl::opt<unsigned> TailDuplicationMinimumDuplication(
56     "tail-duplication-minimum-duplication",
57     cl::desc("tail blocks with size (in bytes) not exceeding the value are "
58              "always duplicated"),
59     cl::ReallyHidden, cl::init(2), cl::cat(BoltOptCategory));
60 
61 static cl::opt<bool> TailDuplicationConstCopyPropagation(
62     "tail-duplication-const-copy-propagation",
63     cl::desc("enable const and copy propagation after tail duplication"),
64     cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));
65 
66 static cl::opt<unsigned> TailDuplicationMaxCacheDistance(
67     "tail-duplication-max-cache-distance",
68     cl::desc("The weight of backward jumps for ExtTSP value"), cl::init(256),
69     cl::ReallyHidden, cl::cat(BoltOptCategory));
70 
71 static cl::opt<double> TailDuplicationCacheBackwardWeight(
72     "tail-duplication-cache-backward-weight",
73     cl::desc(
74         "The maximum distance (in bytes) of backward jumps for ExtTSP value"),
75     cl::init(0.5), cl::ReallyHidden, cl::cat(BoltOptCategory));
76 
77 } // namespace opts
78 
79 namespace llvm {
80 namespace bolt {
81 
getCallerSavedRegs(const MCInst & Inst,BitVector & Regs,BinaryContext & BC) const82 void TailDuplication::getCallerSavedRegs(const MCInst &Inst, BitVector &Regs,
83                                          BinaryContext &BC) const {
84   if (!BC.MIB->isCall(Inst))
85     return;
86   BitVector CallRegs = BitVector(BC.MRI->getNumRegs(), false);
87   BC.MIB->getCalleeSavedRegs(CallRegs);
88   CallRegs.flip();
89   Regs |= CallRegs;
90 }
91 
regIsPossiblyOverwritten(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const92 bool TailDuplication::regIsPossiblyOverwritten(const MCInst &Inst, unsigned Reg,
93                                                BinaryContext &BC) const {
94   BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false);
95   BC.MIB->getWrittenRegs(Inst, WrittenRegs);
96   getCallerSavedRegs(Inst, WrittenRegs, BC);
97   if (BC.MIB->isRep(Inst))
98     BC.MIB->getRepRegs(WrittenRegs);
99   WrittenRegs &= BC.MIB->getAliases(Reg, false);
100   return WrittenRegs.any();
101 }
102 
regIsDefinitelyOverwritten(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const103 bool TailDuplication::regIsDefinitelyOverwritten(const MCInst &Inst,
104                                                  unsigned Reg,
105                                                  BinaryContext &BC) const {
106   BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false);
107   BC.MIB->getWrittenRegs(Inst, WrittenRegs);
108   getCallerSavedRegs(Inst, WrittenRegs, BC);
109   if (BC.MIB->isRep(Inst))
110     BC.MIB->getRepRegs(WrittenRegs);
111   return (!regIsUsed(Inst, Reg, BC) && WrittenRegs.test(Reg) &&
112           !BC.MIB->isConditionalMove(Inst));
113 }
114 
regIsUsed(const MCInst & Inst,unsigned Reg,BinaryContext & BC) const115 bool TailDuplication::regIsUsed(const MCInst &Inst, unsigned Reg,
116                                 BinaryContext &BC) const {
117   BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false);
118   BC.MIB->getSrcRegs(Inst, SrcRegs);
119   SrcRegs &= BC.MIB->getAliases(Reg, true);
120   return SrcRegs.any();
121 }
122 
isOverwrittenBeforeUsed(BinaryBasicBlock & StartBB,unsigned Reg) const123 bool TailDuplication::isOverwrittenBeforeUsed(BinaryBasicBlock &StartBB,
124                                               unsigned Reg) const {
125   BinaryFunction *BF = StartBB.getFunction();
126   BinaryContext &BC = BF->getBinaryContext();
127   std::queue<BinaryBasicBlock *> Q;
128   for (auto Itr = StartBB.succ_begin(); Itr != StartBB.succ_end(); ++Itr) {
129     BinaryBasicBlock *NextBB = *Itr;
130     Q.push(NextBB);
131   }
132   std::set<BinaryBasicBlock *> Visited;
133   // Breadth first search through successive blocks and see if Reg is ever used
134   // before its overwritten
135   while (Q.size() > 0) {
136     BinaryBasicBlock *CurrBB = Q.front();
137     Q.pop();
138     if (Visited.count(CurrBB))
139       continue;
140     Visited.insert(CurrBB);
141     bool Overwritten = false;
142     for (auto Itr = CurrBB->begin(); Itr != CurrBB->end(); ++Itr) {
143       MCInst &Inst = *Itr;
144       if (regIsUsed(Inst, Reg, BC))
145         return false;
146       if (regIsDefinitelyOverwritten(Inst, Reg, BC)) {
147         Overwritten = true;
148         break;
149       }
150     }
151     if (Overwritten)
152       continue;
153     for (auto Itr = CurrBB->succ_begin(); Itr != CurrBB->succ_end(); ++Itr) {
154       BinaryBasicBlock *NextBB = *Itr;
155       Q.push(NextBB);
156     }
157   }
158   return true;
159 }
160 
constantAndCopyPropagate(BinaryBasicBlock & OriginalBB,std::vector<BinaryBasicBlock * > & BlocksToPropagate)161 void TailDuplication::constantAndCopyPropagate(
162     BinaryBasicBlock &OriginalBB,
163     std::vector<BinaryBasicBlock *> &BlocksToPropagate) {
164   BinaryFunction *BF = OriginalBB.getFunction();
165   BinaryContext &BC = BF->getBinaryContext();
166 
167   BlocksToPropagate.insert(BlocksToPropagate.begin(), &OriginalBB);
168   // Iterate through the original instructions to find one to propagate
169   for (auto Itr = OriginalBB.begin(); Itr != OriginalBB.end(); ++Itr) {
170     MCInst &OriginalInst = *Itr;
171     // It must be a non conditional
172     if (BC.MIB->isConditionalMove(OriginalInst))
173       continue;
174 
175     // Move immediate or move register
176     if ((!BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate() ||
177          !OriginalInst.getOperand(1).isImm()) &&
178         (!BC.MII->get(OriginalInst.getOpcode()).isMoveReg() ||
179          !OriginalInst.getOperand(1).isReg()))
180       continue;
181 
182     // True if this is constant propagation and not copy propagation
183     bool ConstantProp = BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate();
184     // The Register to replaced
185     unsigned Reg = OriginalInst.getOperand(0).getReg();
186     // True if the register to replace was replaced everywhere it was used
187     bool ReplacedEverywhere = true;
188     // True if the register was definitely overwritten
189     bool Overwritten = false;
190     // True if the register to replace and the register to replace with (for
191     // copy propagation) has not been overwritten and is still usable
192     bool RegsActive = true;
193 
194     // Iterate through successor blocks and through their instructions
195     for (BinaryBasicBlock *NextBB : BlocksToPropagate) {
196       for (auto PropagateItr =
197                ((NextBB == &OriginalBB) ? Itr + 1 : NextBB->begin());
198            PropagateItr < NextBB->end(); ++PropagateItr) {
199         MCInst &PropagateInst = *PropagateItr;
200         if (regIsUsed(PropagateInst, Reg, BC)) {
201           bool Replaced = false;
202           // If both registers are active for copy propagation or the register
203           // to replace is active for constant propagation
204           if (RegsActive) {
205             // Set Replaced and so ReplacedEverwhere to false if it cannot be
206             // replaced (no replacing that opcode, Register is src and dest)
207             if (ConstantProp)
208               Replaced = BC.MIB->replaceRegWithImm(
209                   PropagateInst, Reg, OriginalInst.getOperand(1).getImm());
210             else
211               Replaced = BC.MIB->replaceRegWithReg(
212                   PropagateInst, Reg, OriginalInst.getOperand(1).getReg());
213           }
214           ReplacedEverywhere = ReplacedEverywhere && Replaced;
215         }
216         // For copy propagation, make sure no propagation happens after the
217         // register to replace with is overwritten
218         if (!ConstantProp &&
219             regIsPossiblyOverwritten(PropagateInst,
220                                      OriginalInst.getOperand(1).getReg(), BC))
221           RegsActive = false;
222 
223         // Make sure no propagation happens after the register to replace is
224         // overwritten
225         if (regIsPossiblyOverwritten(PropagateInst, Reg, BC))
226           RegsActive = false;
227 
228         // Record if the register to replace is overwritten
229         if (regIsDefinitelyOverwritten(PropagateInst, Reg, BC)) {
230           Overwritten = true;
231           break;
232         }
233       }
234       if (Overwritten)
235         break;
236     }
237 
238     // If the register was replaced everwhere and it was overwritten in either
239     // one of the iterated through blocks or one of the successor blocks, delete
240     // the original move instruction
241     if (ReplacedEverywhere &&
242         (Overwritten ||
243          isOverwrittenBeforeUsed(
244              *BlocksToPropagate[BlocksToPropagate.size() - 1], Reg))) {
245       // If both registers are active for copy propagation or the register
246       // to replace is active for constant propagation
247       StaticInstructionDeletionCount++;
248       DynamicInstructionDeletionCount += OriginalBB.getExecutionCount();
249       Itr = std::prev(OriginalBB.eraseInstruction(Itr));
250     }
251   }
252 }
253 
isInCacheLine(const BinaryBasicBlock & BB,const BinaryBasicBlock & Succ) const254 bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB,
255                                     const BinaryBasicBlock &Succ) const {
256   if (&BB == &Succ)
257     return true;
258 
259   uint64_t Distance = 0;
260   int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1;
261 
262   for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex();
263        I += Direction) {
264     Distance += BB.getFunction()->getLayout().getBlock(I)->getOriginalSize();
265     if (Distance > opts::TailDuplicationMinimumOffset)
266       return false;
267   }
268   return true;
269 }
270 
271 std::vector<BinaryBasicBlock *>
moderateDuplicate(BinaryBasicBlock & BB,BinaryBasicBlock & Tail) const272 TailDuplication::moderateDuplicate(BinaryBasicBlock &BB,
273                                    BinaryBasicBlock &Tail) const {
274   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
275   // The block must be hot
276   if (BB.getKnownExecutionCount() == 0)
277     return BlocksToDuplicate;
278   // and its sucessor is not already in the same cache line
279   if (isInCacheLine(BB, Tail))
280     return BlocksToDuplicate;
281   // and its size do not exceed the maximum allowed size
282   if (Tail.getOriginalSize() > opts::TailDuplicationMaximumDuplication)
283     return BlocksToDuplicate;
284   // If duplicating would introduce a new branch, don't duplicate
285   for (auto Itr = Tail.succ_begin(); Itr != Tail.succ_end(); ++Itr) {
286     if ((*Itr)->getLayoutIndex() == Tail.getLayoutIndex() + 1)
287       return BlocksToDuplicate;
288   }
289 
290   BlocksToDuplicate.push_back(&Tail);
291   return BlocksToDuplicate;
292 }
293 
294 std::vector<BinaryBasicBlock *>
aggressiveDuplicate(BinaryBasicBlock & BB,BinaryBasicBlock & Tail) const295 TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
296                                      BinaryBasicBlock &Tail) const {
297   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
298   // The block must be hot
299   if (BB.getKnownExecutionCount() == 0)
300     return BlocksToDuplicate;
301   // and its sucessor is not already in the same cache line
302   if (isInCacheLine(BB, Tail))
303     return BlocksToDuplicate;
304 
305   BinaryBasicBlock *CurrBB = &BB;
306   while (CurrBB) {
307     LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
308                       << CurrBB->getName() << " to duplication list\n";);
309     BlocksToDuplicate.push_back(CurrBB);
310 
311     if (CurrBB->hasJumpTable()) {
312       LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing duplication "
313                            "list due to a JT in "
314                         << CurrBB->getName() << '\n';);
315       BlocksToDuplicate.clear();
316       break;
317     }
318 
319     // With no successors, we've reached the end and should duplicate all of
320     // BlocksToDuplicate
321     if (CurrBB->succ_size() == 0)
322       break;
323 
324     // With two successors, if they're both a jump, we should duplicate all
325     // blocks in BlocksToDuplicate. Otherwise, we cannot find a simple stream of
326     // blocks to copy
327     if (CurrBB->succ_size() >= 2) {
328       if (CurrBB->getConditionalSuccessor(false)->getLayoutIndex() ==
329               CurrBB->getLayoutIndex() + 1 ||
330           CurrBB->getConditionalSuccessor(true)->getLayoutIndex() ==
331               CurrBB->getLayoutIndex() + 1) {
332         LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing "
333                              "duplication list, can't find a simple stream at "
334                           << CurrBB->getName() << '\n';);
335         BlocksToDuplicate.clear();
336       }
337       break;
338     }
339 
340     // With one successor, if its a jump, we should duplicate all blocks in
341     // BlocksToDuplicate. Otherwise, we should keep going
342     BinaryBasicBlock *SuccBB = CurrBB->getSuccessor();
343     if (SuccBB->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
344       break;
345     CurrBB = SuccBB;
346   }
347   // Don't duplicate if its too much code
348   unsigned DuplicationByteCount = std::accumulate(
349       std::begin(BlocksToDuplicate), std::end(BlocksToDuplicate), 0,
350       [](int value, BinaryBasicBlock *p) {
351         return value + p->getOriginalSize();
352       });
353   if (DuplicationByteCount > opts::TailDuplicationMaximumDuplication) {
354     LLVM_DEBUG(dbgs() << "Aggressive tail duplication: duplication byte count ("
355                       << DuplicationByteCount << ") exceeds maximum "
356                       << opts::TailDuplicationMaximumDuplication << '\n';);
357     BlocksToDuplicate.clear();
358   }
359   LLVM_DEBUG(dbgs() << "Aggressive tail duplication: found "
360                     << BlocksToDuplicate.size() << " blocks to duplicate\n";);
361   return BlocksToDuplicate;
362 }
363 
shouldDuplicate(BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const364 bool TailDuplication::shouldDuplicate(BinaryBasicBlock *Pred,
365                                       BinaryBasicBlock *Tail) const {
366   if (Pred == Tail)
367     return false;
368   // Cannot duplicate non-tail blocks
369   if (Tail->succ_size() != 0)
370     return false;
371   // The blocks are already in the order
372   if (Pred->getLayoutIndex() + 1 == Tail->getLayoutIndex())
373     return false;
374   // No tail duplication for blocks with jump tables
375   if (Pred->hasJumpTable())
376     return false;
377   if (Tail->hasJumpTable())
378     return false;
379 
380   return true;
381 }
382 
cacheScore(uint64_t SrcAddr,uint64_t SrcSize,uint64_t DstAddr,uint64_t DstSize,uint64_t Count) const383 double TailDuplication::cacheScore(uint64_t SrcAddr, uint64_t SrcSize,
384                                    uint64_t DstAddr, uint64_t DstSize,
385                                    uint64_t Count) const {
386   assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE);
387 
388   bool IsForwardJump = SrcAddr <= DstAddr;
389   uint64_t JumpDistance = 0;
390   // Computing the length of the jump so that it takes the sizes of the two
391   // blocks into consideration
392   if (IsForwardJump) {
393     JumpDistance = (DstAddr + DstSize) - (SrcAddr);
394   } else {
395     JumpDistance = (SrcAddr + SrcSize) - (DstAddr);
396   }
397 
398   if (JumpDistance >= opts::TailDuplicationMaxCacheDistance)
399     return 0;
400   double Prob = 1.0 - static_cast<double>(JumpDistance) /
401                           opts::TailDuplicationMaxCacheDistance;
402   return (IsForwardJump ? 1.0 : opts::TailDuplicationCacheBackwardWeight) *
403          Prob * Count;
404 }
405 
cacheScoreImproved(const MCCodeEmitter * Emitter,BinaryFunction & BF,BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const406 bool TailDuplication::cacheScoreImproved(const MCCodeEmitter *Emitter,
407                                          BinaryFunction &BF,
408                                          BinaryBasicBlock *Pred,
409                                          BinaryBasicBlock *Tail) const {
410   // Collect (estimated) basic block sizes
411   DenseMap<const BinaryBasicBlock *, uint64_t> BBSize;
412   for (const BinaryBasicBlock &BB : BF) {
413     BBSize[&BB] = std::max<uint64_t>(BB.estimateSize(Emitter), 1);
414   }
415 
416   // Build current addresses of basic blocks starting at the entry block
417   DenseMap<BinaryBasicBlock *, uint64_t> CurAddr;
418   uint64_t Addr = 0;
419   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
420     CurAddr[SrcBB] = Addr;
421     Addr += BBSize[SrcBB];
422   }
423 
424   // Build new addresses (after duplication) starting at the entry block
425   DenseMap<BinaryBasicBlock *, uint64_t> NewAddr;
426   Addr = 0;
427   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
428     NewAddr[SrcBB] = Addr;
429     Addr += BBSize[SrcBB];
430     if (SrcBB == Pred)
431       Addr += BBSize[Tail];
432   }
433 
434   // Compute the cache score for the existing layout of basic blocks
435   double CurScore = 0;
436   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
437     auto BI = SrcBB->branch_info_begin();
438     for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
439       if (SrcBB != DstBB) {
440         CurScore += cacheScore(CurAddr[SrcBB], BBSize[SrcBB], CurAddr[DstBB],
441                                BBSize[DstBB], BI->Count);
442       }
443       ++BI;
444     }
445   }
446 
447   // Compute the cache score for the layout of blocks after tail duplication
448   double NewScore = 0;
449   for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
450     auto BI = SrcBB->branch_info_begin();
451     for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
452       if (SrcBB != DstBB) {
453         if (SrcBB == Pred && DstBB == Tail) {
454           NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB],
455                                  NewAddr[SrcBB] + BBSize[SrcBB], BBSize[DstBB],
456                                  BI->Count);
457         } else {
458           NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB], NewAddr[DstBB],
459                                  BBSize[DstBB], BI->Count);
460         }
461       }
462       ++BI;
463     }
464   }
465 
466   return NewScore > CurScore;
467 }
468 
469 std::vector<BinaryBasicBlock *>
cacheDuplicate(const MCCodeEmitter * Emitter,BinaryFunction & BF,BinaryBasicBlock * Pred,BinaryBasicBlock * Tail) const470 TailDuplication::cacheDuplicate(const MCCodeEmitter *Emitter,
471                                 BinaryFunction &BF, BinaryBasicBlock *Pred,
472                                 BinaryBasicBlock *Tail) const {
473   std::vector<BinaryBasicBlock *> BlocksToDuplicate;
474 
475   // No need to duplicate cold basic blocks
476   if (Pred->isCold() || Tail->isCold()) {
477     return BlocksToDuplicate;
478   }
479   // Always duplicate "small" tail basic blocks, which might be beneficial for
480   // code size, since a jump instruction is eliminated
481   if (Tail->estimateSize(Emitter) <= opts::TailDuplicationMinimumDuplication) {
482     BlocksToDuplicate.push_back(Tail);
483     return BlocksToDuplicate;
484   }
485   // Never duplicate "large" tail basic blocks
486   if (Tail->estimateSize(Emitter) > opts::TailDuplicationMaximumDuplication) {
487     return BlocksToDuplicate;
488   }
489   // Do not append basic blocks after the last hot block in the current layout
490   auto NextBlock = BF.getLayout().getBasicBlockAfter(Pred);
491   if (NextBlock == nullptr || (!Pred->isCold() && NextBlock->isCold())) {
492     return BlocksToDuplicate;
493   }
494 
495   // Duplicate the tail only if it improves the cache score
496   if (cacheScoreImproved(Emitter, BF, Pred, Tail)) {
497     BlocksToDuplicate.push_back(Tail);
498   }
499 
500   return BlocksToDuplicate;
501 }
502 
duplicateBlocks(BinaryBasicBlock & BB,const std::vector<BinaryBasicBlock * > & BlocksToDuplicate) const503 std::vector<BinaryBasicBlock *> TailDuplication::duplicateBlocks(
504     BinaryBasicBlock &BB,
505     const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const {
506   BinaryFunction *BF = BB.getFunction();
507   BinaryContext &BC = BF->getBinaryContext();
508 
509   // Ratio of this new branches execution count to the total size of the
510   // successor's execution count.  Used to set this new branches execution count
511   // and lower the old successor's execution count
512   double ExecutionCountRatio =
513       BB.getExecutionCount() >= BB.getSuccessor()->getExecutionCount()
514           ? 1.0
515           : (double)BB.getExecutionCount() /
516                 BB.getSuccessor()->getExecutionCount();
517 
518   // Use the last branch info when adding a successor to LastBB
519   BinaryBasicBlock::BinaryBranchInfo &LastBI =
520       BB.getBranchInfo(*(BB.getSuccessor()));
521 
522   BinaryBasicBlock *LastOriginalBB = &BB;
523   BinaryBasicBlock *LastDuplicatedBB = &BB;
524   assert(LastDuplicatedBB->succ_size() == 1 &&
525          "tail duplication cannot act on a block with more than 1 successor");
526   LastDuplicatedBB->removeSuccessor(LastDuplicatedBB->getSuccessor());
527 
528   std::vector<std::unique_ptr<BinaryBasicBlock>> DuplicatedBlocks;
529   std::vector<BinaryBasicBlock *> DuplicatedBlocksToReturn;
530 
531   for (BinaryBasicBlock *CurBB : BlocksToDuplicate) {
532     DuplicatedBlocks.emplace_back(
533         BF->createBasicBlock((BC.Ctx)->createNamedTempSymbol("tail-dup")));
534     BinaryBasicBlock *NewBB = DuplicatedBlocks.back().get();
535 
536     NewBB->addInstructions(CurBB->begin(), CurBB->end());
537     // Set execution count as if it was just a copy of the original
538     NewBB->setExecutionCount(CurBB->getExecutionCount());
539     NewBB->setIsCold(CurBB->isCold());
540     LastDuplicatedBB->addSuccessor(NewBB, LastBI);
541 
542     DuplicatedBlocksToReturn.push_back(NewBB);
543 
544     // As long as its not the first block, adjust both original and duplicated
545     // to what they should be
546     if (LastDuplicatedBB != &BB) {
547       LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
548       LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
549     }
550 
551     if (CurBB->succ_size() == 1)
552       LastBI = CurBB->getBranchInfo(*(CurBB->getSuccessor()));
553 
554     LastOriginalBB = CurBB;
555     LastDuplicatedBB = NewBB;
556   }
557 
558   LastDuplicatedBB->addSuccessors(
559       LastOriginalBB->succ_begin(), LastOriginalBB->succ_end(),
560       LastOriginalBB->branch_info_begin(), LastOriginalBB->branch_info_end());
561 
562   LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
563   LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
564 
565   BF->insertBasicBlocks(&BB, std::move(DuplicatedBlocks));
566 
567   return DuplicatedBlocksToReturn;
568 }
569 
runOnFunction(BinaryFunction & Function)570 void TailDuplication::runOnFunction(BinaryFunction &Function) {
571   // Create a separate MCCodeEmitter to allow lock-free execution
572   BinaryContext::IndependentCodeEmitter Emitter;
573   if (!opts::NoThreads) {
574     Emitter = Function.getBinaryContext().createIndependentMCCodeEmitter();
575   }
576 
577   Function.getLayout().updateLayoutIndices();
578 
579   // New blocks will be added and layout will change,
580   // so make a copy here to iterate over the original layout
581   BinaryFunction::BasicBlockOrderType BlockLayout(
582       Function.getLayout().block_begin(), Function.getLayout().block_end());
583   bool ModifiedFunction = false;
584   for (BinaryBasicBlock *BB : BlockLayout) {
585     AllDynamicCount += BB->getKnownExecutionCount();
586 
587     // The block must be with one successor
588     if (BB->succ_size() != 1)
589       continue;
590     BinaryBasicBlock *Tail = BB->getSuccessor();
591     // Verify that the tail should be duplicated
592     if (!shouldDuplicate(BB, Tail))
593       continue;
594 
595     std::vector<BinaryBasicBlock *> BlocksToDuplicate;
596     if (opts::TailDuplicationMode == TailDuplication::TD_AGGRESSIVE) {
597       BlocksToDuplicate = aggressiveDuplicate(*BB, *Tail);
598     } else if (opts::TailDuplicationMode == TailDuplication::TD_MODERATE) {
599       BlocksToDuplicate = moderateDuplicate(*BB, *Tail);
600     } else if (opts::TailDuplicationMode == TailDuplication::TD_CACHE) {
601       BlocksToDuplicate = cacheDuplicate(Emitter.MCE.get(), Function, BB, Tail);
602     } else {
603       llvm_unreachable("unknown tail duplication mode");
604     }
605 
606     if (BlocksToDuplicate.empty())
607       continue;
608 
609     // Apply the the duplication
610     ModifiedFunction = true;
611     DuplicationsDynamicCount += BB->getExecutionCount();
612     auto DuplicatedBlocks = duplicateBlocks(*BB, BlocksToDuplicate);
613     for (BinaryBasicBlock *BB : DuplicatedBlocks) {
614       DuplicatedBlockCount++;
615       DuplicatedByteCount += BB->estimateSize(Emitter.MCE.get());
616     }
617 
618     if (opts::TailDuplicationConstCopyPropagation) {
619       constantAndCopyPropagate(*BB, DuplicatedBlocks);
620       BinaryBasicBlock *FirstBB = BlocksToDuplicate[0];
621       if (FirstBB->pred_size() == 1) {
622         BinaryBasicBlock *PredBB = *FirstBB->pred_begin();
623         if (PredBB->succ_size() == 1)
624           constantAndCopyPropagate(*PredBB, BlocksToDuplicate);
625       }
626     }
627 
628     // Layout indices might be stale after duplication
629     Function.getLayout().updateLayoutIndices();
630   }
631   if (ModifiedFunction)
632     ModifiedFunctions++;
633 }
634 
runOnFunctions(BinaryContext & BC)635 void TailDuplication::runOnFunctions(BinaryContext &BC) {
636   if (opts::TailDuplicationMode == TailDuplication::TD_NONE)
637     return;
638 
639   for (auto &It : BC.getBinaryFunctions()) {
640     BinaryFunction &Function = It.second;
641     if (!shouldOptimize(Function))
642       continue;
643     runOnFunction(Function);
644   }
645 
646   outs() << "BOLT-INFO: tail duplication"
647          << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
648                    100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
649          << format(" duplicated %zu blocks (%zu bytes) responsible for",
650                    DuplicatedBlockCount, DuplicatedByteCount)
651          << format(" %zu dynamic executions (%.2f%% of all block executions)",
652                    DuplicationsDynamicCount,
653                    100.0 * DuplicationsDynamicCount / AllDynamicCount)
654          << "\n";
655 
656   if (opts::TailDuplicationConstCopyPropagation) {
657     outs() << "BOLT-INFO: tail duplication "
658            << format("applied %zu static and %zu dynamic propagation deletions",
659                      StaticInstructionDeletionCount,
660                      DynamicInstructionDeletionCount)
661            << "\n";
662   }
663 }
664 
665 } // end namespace bolt
666 } // end namespace llvm
667