1 //===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains an optimization for div and rem on architectures that
11 // execute short instructions significantly faster than longer instructions.
12 // For example, on Intel Atom 32-bit divides are slow enough that during
13 // runtime it is profitable to check the value of the operands, and if they are
14 // positive and less than 256 use an unsigned 8-bit divide.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/IR/Function.h"
22 #include "llvm/IR/IRBuilder.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/Transforms/Utils/Local.h"
25 
26 using namespace llvm;
27 
28 #define DEBUG_TYPE "bypass-slow-division"
29 
30 namespace {
31   struct DivOpInfo {
32     bool SignedOp;
33     Value *Dividend;
34     Value *Divisor;
35 
36     DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
37       : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
38   };
39 
40   struct QuotRemPair {
41     Value *Quotient;
42     Value *Remainder;
43 
44     QuotRemPair(Value *InQuotient, Value *InRemainder)
45         : Quotient(InQuotient), Remainder(InRemainder) {}
46   };
47 
48   /// A quotient and remainder, plus a BB from which they logically "originate".
49   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
50   /// corresponding predecessor.
51   struct QuotRemWithBB {
52     BasicBlock *BB = nullptr;
53     Value *Quotient = nullptr;
54     Value *Remainder = nullptr;
55   };
56 }
57 
58 namespace llvm {
59   template<>
60   struct DenseMapInfo<DivOpInfo> {
61     static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
62       return Val1.SignedOp == Val2.SignedOp &&
63              Val1.Dividend == Val2.Dividend &&
64              Val1.Divisor == Val2.Divisor;
65     }
66 
67     static DivOpInfo getEmptyKey() {
68       return DivOpInfo(false, nullptr, nullptr);
69     }
70 
71     static DivOpInfo getTombstoneKey() {
72       return DivOpInfo(true, nullptr, nullptr);
73     }
74 
75     static unsigned getHashValue(const DivOpInfo &Val) {
76       return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
77                         reinterpret_cast<uintptr_t>(Val.Divisor)) ^
78                         (unsigned)Val.SignedOp;
79     }
80   };
81 
82   typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
83   typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
84 }
85 
86 namespace {
87 enum ValueRange {
88   /// Operand definitely fits into BypassType. No runtime checks are needed.
89   VALRNG_SHORT,
90   /// A runtime check is required, as value range is unknown.
91   VALRNG_UNKNOWN,
92   /// Operand is unlikely to fit into BypassType. The bypassing should be
93   /// disabled.
94   VALRNG_LONG
95 };
96 
97 class FastDivInsertionTask {
98   bool IsValidTask = false;
99   Instruction *SlowDivOrRem = nullptr;
100   IntegerType *BypassType = nullptr;
101   BasicBlock *MainBB = nullptr;
102 
103   ValueRange getValueRange(Value *Op);
104   QuotRemWithBB createSlowBB(BasicBlock *Successor);
105   QuotRemWithBB createFastBB(BasicBlock *Successor);
106   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
107                                    BasicBlock *PhiBB);
108   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
109   Optional<QuotRemPair> insertFastDivAndRem();
110 
111   bool isSignedOp() {
112     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
113            SlowDivOrRem->getOpcode() == Instruction::SRem;
114   }
115   bool isDivisionOp() {
116     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
117            SlowDivOrRem->getOpcode() == Instruction::UDiv;
118   }
119   Type *getSlowType() { return SlowDivOrRem->getType(); }
120 
121 public:
122   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
123   Value *getReplacement(DivCacheTy &Cache);
124 };
125 } // anonymous namespace
126 
127 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
128                                            const BypassWidthsTy &BypassWidths) {
129   switch (I->getOpcode()) {
130   case Instruction::UDiv:
131   case Instruction::SDiv:
132   case Instruction::URem:
133   case Instruction::SRem:
134     SlowDivOrRem = I;
135     break;
136   default:
137     // I is not a div/rem operation.
138     return;
139   }
140 
141   // Skip division on vector types. Only optimize integer instructions.
142   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
143   if (!SlowType)
144     return;
145 
146   // Skip if this bitwidth is not bypassed.
147   auto BI = BypassWidths.find(SlowType->getBitWidth());
148   if (BI == BypassWidths.end())
149     return;
150 
151   // Get type for div/rem instruction with bypass bitwidth.
152   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
153   BypassType = BT;
154 
155   // The original basic block.
156   MainBB = I->getParent();
157 
158   // The instruction is indeed a slow div or rem operation.
159   IsValidTask = true;
160 }
161 
162 /// Reuses previously-computed dividend or remainder from the current BB if
163 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
164 /// perform the optimization and caches the resulting dividend and remainder.
165 /// If no replacement can be generated, nullptr is returned.
166 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
167   // First, make sure that the task is valid.
168   if (!IsValidTask)
169     return nullptr;
170 
171   // Then, look for a value in Cache.
172   Value *Dividend = SlowDivOrRem->getOperand(0);
173   Value *Divisor = SlowDivOrRem->getOperand(1);
174   DivOpInfo Key(isSignedOp(), Dividend, Divisor);
175   auto CacheI = Cache.find(Key);
176 
177   if (CacheI == Cache.end()) {
178     // If previous instance does not exist, try to insert fast div.
179     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
180     // Bail out if insertFastDivAndRem has failed.
181     if (!OptResult)
182       return nullptr;
183     CacheI = Cache.insert({Key, *OptResult}).first;
184   }
185 
186   QuotRemPair &Value = CacheI->second;
187   return isDivisionOp() ? Value.Quotient : Value.Remainder;
188 }
189 
190 /// Check if an integer value fits into our bypass type.
191 ValueRange FastDivInsertionTask::getValueRange(Value *V) {
192   unsigned ShortLen = BypassType->getBitWidth();
193   unsigned LongLen = V->getType()->getIntegerBitWidth();
194 
195   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
196   unsigned HiBits = LongLen - ShortLen;
197 
198   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
199   APInt Zeros(LongLen, 0), Ones(LongLen, 0);
200 
201   computeKnownBits(V, Zeros, Ones, DL);
202 
203   if (Zeros.countLeadingOnes() >= HiBits)
204     return VALRNG_SHORT;
205 
206   if (Ones.countLeadingZeros() < HiBits)
207     return VALRNG_LONG;
208 
209   return VALRNG_UNKNOWN;
210 }
211 
212 /// Add new basic block for slow div and rem operations and put it before
213 /// SuccessorBB.
214 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
215   QuotRemWithBB DivRemPair;
216   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
217                                      MainBB->getParent(), SuccessorBB);
218   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
219 
220   Value *Dividend = SlowDivOrRem->getOperand(0);
221   Value *Divisor = SlowDivOrRem->getOperand(1);
222 
223   if (isSignedOp()) {
224     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
225     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
226   } else {
227     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
228     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
229   }
230 
231   Builder.CreateBr(SuccessorBB);
232   return DivRemPair;
233 }
234 
235 /// Add new basic block for fast div and rem operations and put it before
236 /// SuccessorBB.
237 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
238   QuotRemWithBB DivRemPair;
239   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
240                                      MainBB->getParent(), SuccessorBB);
241   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
242 
243   Value *Dividend = SlowDivOrRem->getOperand(0);
244   Value *Divisor = SlowDivOrRem->getOperand(1);
245   Value *ShortDivisorV =
246       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
247   Value *ShortDividendV =
248       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
249 
250   // udiv/urem because this optimization only handles positive numbers.
251   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
252   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
253   DivRemPair.Quotient =
254       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
255   DivRemPair.Remainder =
256       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
257   Builder.CreateBr(SuccessorBB);
258 
259   return DivRemPair;
260 }
261 
262 /// Creates Phi nodes for result of Div and Rem.
263 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
264                                                        QuotRemWithBB &RHS,
265                                                        BasicBlock *PhiBB) {
266   IRBuilder<> Builder(PhiBB, PhiBB->begin());
267   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
268   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
269   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
270   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
271   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
272   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
273   return QuotRemPair(QuoPhi, RemPhi);
274 }
275 
276 /// Creates a runtime check to test whether both the divisor and dividend fit
277 /// into BypassType. The check is inserted at the end of MainBB. True return
278 /// value means that the operands fit. Either of the operands may be NULL if it
279 /// doesn't need a runtime check.
280 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
281   assert((Op1 || Op2) && "Nothing to check");
282   IRBuilder<> Builder(MainBB, MainBB->end());
283 
284   Value *OrV;
285   if (Op1 && Op2)
286     OrV = Builder.CreateOr(Op1, Op2);
287   else
288     OrV = Op1 ? Op1 : Op2;
289 
290   // BitMask is inverted to check if the operands are
291   // larger than the bypass type
292   uint64_t BitMask = ~BypassType->getBitMask();
293   Value *AndV = Builder.CreateAnd(OrV, BitMask);
294 
295   // Compare operand values
296   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
297   return Builder.CreateICmpEQ(AndV, ZeroV);
298 }
299 
300 /// Substitutes the div/rem instruction with code that checks the value of the
301 /// operands and uses a shorter-faster div/rem instruction when possible.
302 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
303   Value *Dividend = SlowDivOrRem->getOperand(0);
304   Value *Divisor = SlowDivOrRem->getOperand(1);
305 
306   if (isa<ConstantInt>(Divisor)) {
307     // Keep division by a constant for DAGCombiner.
308     return None;
309   }
310 
311   ValueRange DividendRange = getValueRange(Dividend);
312   if (DividendRange == VALRNG_LONG)
313     return None;
314 
315   ValueRange DivisorRange = getValueRange(Divisor);
316   if (DivisorRange == VALRNG_LONG)
317     return None;
318 
319   bool DividendShort = (DividendRange == VALRNG_SHORT);
320   bool DivisorShort = (DivisorRange == VALRNG_SHORT);
321 
322   if (DividendShort && DivisorShort) {
323     // If both operands are known to be short then just replace the long
324     // division with a short one in-place.
325 
326     IRBuilder<> Builder(SlowDivOrRem);
327     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
328     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
329     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
330     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
331     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
332     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
333     return QuotRemPair(ExtDiv, ExtRem);
334   } else if (DividendShort && !isSignedOp()) {
335     // If the division is unsigned and Dividend is known to be short, then
336     // either
337     // 1) Divisor is less or equal to Dividend, and the result can be computed
338     //    with a short division.
339     // 2) Divisor is greater than Dividend. In this case, no division is needed
340     //    at all: The quotient is 0 and the remainder is equal to Dividend.
341     //
342     // So instead of checking at runtime whether Divisor fits into BypassType,
343     // we emit a runtime check to differentiate between these two cases. This
344     // lets us entirely avoid a long div.
345 
346     // Split the basic block before the div/rem.
347     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
348     // Remove the unconditional branch from MainBB to SuccessorBB.
349     MainBB->getInstList().back().eraseFromParent();
350     QuotRemWithBB Long;
351     Long.BB = MainBB;
352     Long.Quotient = ConstantInt::get(getSlowType(), 0);
353     Long.Remainder = Dividend;
354     QuotRemWithBB Fast = createFastBB(SuccessorBB);
355     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
356     IRBuilder<> Builder(MainBB, MainBB->end());
357     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
358     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
359     return Result;
360   } else {
361     // General case. Create both slow and fast div/rem pairs and choose one of
362     // them at runtime.
363 
364     // Split the basic block before the div/rem.
365     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
366     // Remove the unconditional branch from MainBB to SuccessorBB.
367     MainBB->getInstList().back().eraseFromParent();
368     QuotRemWithBB Fast = createFastBB(SuccessorBB);
369     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
370     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
371     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
372                                             DivisorShort ? nullptr : Divisor);
373     IRBuilder<> Builder(MainBB, MainBB->end());
374     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
375     return Result;
376   }
377 }
378 
379 /// This optimization identifies DIV/REM instructions in a BB that can be
380 /// profitably bypassed and carried out with a shorter, faster divide.
381 bool llvm::bypassSlowDivision(BasicBlock *BB,
382                               const BypassWidthsTy &BypassWidths) {
383   DivCacheTy PerBBDivCache;
384 
385   bool MadeChange = false;
386   Instruction* Next = &*BB->begin();
387   while (Next != nullptr) {
388     // We may add instructions immediately after I, but we want to skip over
389     // them.
390     Instruction* I = Next;
391     Next = Next->getNextNode();
392 
393     FastDivInsertionTask Task(I, BypassWidths);
394     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
395       I->replaceAllUsesWith(Replacement);
396       I->eraseFromParent();
397       MadeChange = true;
398     }
399   }
400 
401   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
402   // create divrem machine instructions.  Now erase any unused divs / rems so we
403   // don't leave extra instructions sitting around.
404   for (auto &KV : PerBBDivCache)
405     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
406       RecursivelyDeleteTriviallyDeadInstructions(V);
407 
408   return MadeChange;
409 }
410