1 //===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains an optimization for div and rem on architectures that
11 // execute short instructions significantly faster than longer instructions.
12 // For example, on Intel Atom 32-bit divides are slow enough that during
13 // runtime it is profitable to check the value of the operands, and if they are
14 // positive and less than 256 use an unsigned 8-bit divide.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/Function.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/Transforms/Utils/Local.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "bypass-slow-division"
30 
31 namespace {
32   struct DivOpInfo {
33     bool SignedOp;
34     Value *Dividend;
35     Value *Divisor;
36 
37     DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
38       : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
39   };
40 
41   struct QuotRemPair {
42     Value *Quotient;
43     Value *Remainder;
44 
45     QuotRemPair(Value *InQuotient, Value *InRemainder)
46         : Quotient(InQuotient), Remainder(InRemainder) {}
47   };
48 
49   /// A quotient and remainder, plus a BB from which they logically "originate".
50   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
51   /// corresponding predecessor.
52   struct QuotRemWithBB {
53     BasicBlock *BB = nullptr;
54     Value *Quotient = nullptr;
55     Value *Remainder = nullptr;
56   };
57 }
58 
59 namespace llvm {
60   template<>
61   struct DenseMapInfo<DivOpInfo> {
62     static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
63       return Val1.SignedOp == Val2.SignedOp &&
64              Val1.Dividend == Val2.Dividend &&
65              Val1.Divisor == Val2.Divisor;
66     }
67 
68     static DivOpInfo getEmptyKey() {
69       return DivOpInfo(false, nullptr, nullptr);
70     }
71 
72     static DivOpInfo getTombstoneKey() {
73       return DivOpInfo(true, nullptr, nullptr);
74     }
75 
76     static unsigned getHashValue(const DivOpInfo &Val) {
77       return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
78                         reinterpret_cast<uintptr_t>(Val.Divisor)) ^
79                         (unsigned)Val.SignedOp;
80     }
81   };
82 
83   typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
84   typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
85   typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
86 }
87 
88 namespace {
89 enum ValueRange {
90   /// Operand definitely fits into BypassType. No runtime checks are needed.
91   VALRNG_KNOWN_SHORT,
92   /// A runtime check is required, as value range is unknown.
93   VALRNG_UNKNOWN,
94   /// Operand is unlikely to fit into BypassType. The bypassing should be
95   /// disabled.
96   VALRNG_LIKELY_LONG
97 };
98 
99 class FastDivInsertionTask {
100   bool IsValidTask = false;
101   Instruction *SlowDivOrRem = nullptr;
102   IntegerType *BypassType = nullptr;
103   BasicBlock *MainBB = nullptr;
104 
105   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
106   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
107   QuotRemWithBB createSlowBB(BasicBlock *Successor);
108   QuotRemWithBB createFastBB(BasicBlock *Successor);
109   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
110                                    BasicBlock *PhiBB);
111   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
112   Optional<QuotRemPair> insertFastDivAndRem();
113 
114   bool isSignedOp() {
115     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
116            SlowDivOrRem->getOpcode() == Instruction::SRem;
117   }
118   bool isDivisionOp() {
119     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
120            SlowDivOrRem->getOpcode() == Instruction::UDiv;
121   }
122   Type *getSlowType() { return SlowDivOrRem->getType(); }
123 
124 public:
125   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
126   Value *getReplacement(DivCacheTy &Cache);
127 };
128 } // anonymous namespace
129 
130 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
131                                            const BypassWidthsTy &BypassWidths) {
132   switch (I->getOpcode()) {
133   case Instruction::UDiv:
134   case Instruction::SDiv:
135   case Instruction::URem:
136   case Instruction::SRem:
137     SlowDivOrRem = I;
138     break;
139   default:
140     // I is not a div/rem operation.
141     return;
142   }
143 
144   // Skip division on vector types. Only optimize integer instructions.
145   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
146   if (!SlowType)
147     return;
148 
149   // Skip if this bitwidth is not bypassed.
150   auto BI = BypassWidths.find(SlowType->getBitWidth());
151   if (BI == BypassWidths.end())
152     return;
153 
154   // Get type for div/rem instruction with bypass bitwidth.
155   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
156   BypassType = BT;
157 
158   // The original basic block.
159   MainBB = I->getParent();
160 
161   // The instruction is indeed a slow div or rem operation.
162   IsValidTask = true;
163 }
164 
165 /// Reuses previously-computed dividend or remainder from the current BB if
166 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
167 /// perform the optimization and caches the resulting dividend and remainder.
168 /// If no replacement can be generated, nullptr is returned.
169 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
170   // First, make sure that the task is valid.
171   if (!IsValidTask)
172     return nullptr;
173 
174   // Then, look for a value in Cache.
175   Value *Dividend = SlowDivOrRem->getOperand(0);
176   Value *Divisor = SlowDivOrRem->getOperand(1);
177   DivOpInfo Key(isSignedOp(), Dividend, Divisor);
178   auto CacheI = Cache.find(Key);
179 
180   if (CacheI == Cache.end()) {
181     // If previous instance does not exist, try to insert fast div.
182     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
183     // Bail out if insertFastDivAndRem has failed.
184     if (!OptResult)
185       return nullptr;
186     CacheI = Cache.insert({Key, *OptResult}).first;
187   }
188 
189   QuotRemPair &Value = CacheI->second;
190   return isDivisionOp() ? Value.Quotient : Value.Remainder;
191 }
192 
193 /// \brief Check if a value looks like a hash.
194 ///
195 /// The routine is expected to detect values computed using the most common hash
196 /// algorithms. Typically, hash computations end with one of the following
197 /// instructions:
198 ///
199 /// 1) MUL with a constant wider than BypassType
200 /// 2) XOR instruction
201 ///
202 /// And even if we are wrong and the value is not a hash, it is still quite
203 /// unlikely that such values will fit into BypassType.
204 ///
205 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
206 /// It is implemented as a depth-first search for values that look neither long
207 /// nor hash-like.
208 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
209   Instruction *I = dyn_cast<Instruction>(V);
210   if (!I)
211     return false;
212 
213   switch (I->getOpcode()) {
214   case Instruction::Xor:
215     return true;
216   case Instruction::Mul: {
217     // After Constant Hoisting pass, long constants may be represented as
218     // bitcast instructions. As a result, some constants may look like an
219     // instruction at first, and an additional check is necessary to find out if
220     // an operand is actually a constant.
221     Value *Op1 = I->getOperand(1);
222     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
223     if (!C && isa<BitCastInst>(Op1))
224       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
225     return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
226   }
227   case Instruction::PHI: {
228     // Stop IR traversal in case of a crazy input code. This limits recursion
229     // depth.
230     if (Visited.size() >= 16)
231       return false;
232     // Do not visit nodes that have been visited already. We return true because
233     // it means that we couldn't find any value that doesn't look hash-like.
234     if (Visited.find(I) != Visited.end())
235       return true;
236     Visited.insert(I);
237     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
238       // Ignore undef values as they probably don't affect the division
239       // operands.
240       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
241              isa<UndefValue>(V);
242     });
243   }
244   default:
245     return false;
246   }
247 }
248 
249 /// Check if an integer value fits into our bypass type.
250 ValueRange FastDivInsertionTask::getValueRange(Value *V,
251                                                VisitedSetTy &Visited) {
252   unsigned ShortLen = BypassType->getBitWidth();
253   unsigned LongLen = V->getType()->getIntegerBitWidth();
254 
255   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
256   unsigned HiBits = LongLen - ShortLen;
257 
258   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
259   APInt Zeros(LongLen, 0), Ones(LongLen, 0);
260 
261   computeKnownBits(V, Zeros, Ones, DL);
262 
263   if (Zeros.countLeadingOnes() >= HiBits)
264     return VALRNG_KNOWN_SHORT;
265 
266   if (Ones.countLeadingZeros() < HiBits)
267     return VALRNG_LIKELY_LONG;
268 
269   // Long integer divisions are often used in hashtable implementations. It's
270   // not worth bypassing such divisions because hash values are extremely
271   // unlikely to have enough leading zeros. The call below tries to detect
272   // values that are unlikely to fit BypassType (including hashes).
273   if (isHashLikeValue(V, Visited))
274     return VALRNG_LIKELY_LONG;
275 
276   return VALRNG_UNKNOWN;
277 }
278 
279 /// Add new basic block for slow div and rem operations and put it before
280 /// SuccessorBB.
281 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
282   QuotRemWithBB DivRemPair;
283   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
284                                      MainBB->getParent(), SuccessorBB);
285   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
286 
287   Value *Dividend = SlowDivOrRem->getOperand(0);
288   Value *Divisor = SlowDivOrRem->getOperand(1);
289 
290   if (isSignedOp()) {
291     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
292     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
293   } else {
294     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
295     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
296   }
297 
298   Builder.CreateBr(SuccessorBB);
299   return DivRemPair;
300 }
301 
302 /// Add new basic block for fast div and rem operations and put it before
303 /// SuccessorBB.
304 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
305   QuotRemWithBB DivRemPair;
306   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
307                                      MainBB->getParent(), SuccessorBB);
308   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
309 
310   Value *Dividend = SlowDivOrRem->getOperand(0);
311   Value *Divisor = SlowDivOrRem->getOperand(1);
312   Value *ShortDivisorV =
313       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
314   Value *ShortDividendV =
315       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
316 
317   // udiv/urem because this optimization only handles positive numbers.
318   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
319   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
320   DivRemPair.Quotient =
321       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
322   DivRemPair.Remainder =
323       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
324   Builder.CreateBr(SuccessorBB);
325 
326   return DivRemPair;
327 }
328 
329 /// Creates Phi nodes for result of Div and Rem.
330 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
331                                                        QuotRemWithBB &RHS,
332                                                        BasicBlock *PhiBB) {
333   IRBuilder<> Builder(PhiBB, PhiBB->begin());
334   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
335   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
336   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
337   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
338   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
339   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
340   return QuotRemPair(QuoPhi, RemPhi);
341 }
342 
343 /// Creates a runtime check to test whether both the divisor and dividend fit
344 /// into BypassType. The check is inserted at the end of MainBB. True return
345 /// value means that the operands fit. Either of the operands may be NULL if it
346 /// doesn't need a runtime check.
347 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
348   assert((Op1 || Op2) && "Nothing to check");
349   IRBuilder<> Builder(MainBB, MainBB->end());
350 
351   Value *OrV;
352   if (Op1 && Op2)
353     OrV = Builder.CreateOr(Op1, Op2);
354   else
355     OrV = Op1 ? Op1 : Op2;
356 
357   // BitMask is inverted to check if the operands are
358   // larger than the bypass type
359   uint64_t BitMask = ~BypassType->getBitMask();
360   Value *AndV = Builder.CreateAnd(OrV, BitMask);
361 
362   // Compare operand values
363   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
364   return Builder.CreateICmpEQ(AndV, ZeroV);
365 }
366 
367 /// Substitutes the div/rem instruction with code that checks the value of the
368 /// operands and uses a shorter-faster div/rem instruction when possible.
369 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
370   Value *Dividend = SlowDivOrRem->getOperand(0);
371   Value *Divisor = SlowDivOrRem->getOperand(1);
372 
373   if (isa<ConstantInt>(Divisor)) {
374     // Keep division by a constant for DAGCombiner.
375     return None;
376   }
377 
378   VisitedSetTy SetL;
379   ValueRange DividendRange = getValueRange(Dividend, SetL);
380   if (DividendRange == VALRNG_LIKELY_LONG)
381     return None;
382 
383   VisitedSetTy SetR;
384   ValueRange DivisorRange = getValueRange(Divisor, SetR);
385   if (DivisorRange == VALRNG_LIKELY_LONG)
386     return None;
387 
388   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
389   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
390 
391   if (DividendShort && DivisorShort) {
392     // If both operands are known to be short then just replace the long
393     // division with a short one in-place.
394 
395     IRBuilder<> Builder(SlowDivOrRem);
396     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
397     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
398     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
399     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
400     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
401     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
402     return QuotRemPair(ExtDiv, ExtRem);
403   } else if (DividendShort && !isSignedOp()) {
404     // If the division is unsigned and Dividend is known to be short, then
405     // either
406     // 1) Divisor is less or equal to Dividend, and the result can be computed
407     //    with a short division.
408     // 2) Divisor is greater than Dividend. In this case, no division is needed
409     //    at all: The quotient is 0 and the remainder is equal to Dividend.
410     //
411     // So instead of checking at runtime whether Divisor fits into BypassType,
412     // we emit a runtime check to differentiate between these two cases. This
413     // lets us entirely avoid a long div.
414 
415     // Split the basic block before the div/rem.
416     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
417     // Remove the unconditional branch from MainBB to SuccessorBB.
418     MainBB->getInstList().back().eraseFromParent();
419     QuotRemWithBB Long;
420     Long.BB = MainBB;
421     Long.Quotient = ConstantInt::get(getSlowType(), 0);
422     Long.Remainder = Dividend;
423     QuotRemWithBB Fast = createFastBB(SuccessorBB);
424     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
425     IRBuilder<> Builder(MainBB, MainBB->end());
426     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
427     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
428     return Result;
429   } else {
430     // General case. Create both slow and fast div/rem pairs and choose one of
431     // them at runtime.
432 
433     // Split the basic block before the div/rem.
434     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
435     // Remove the unconditional branch from MainBB to SuccessorBB.
436     MainBB->getInstList().back().eraseFromParent();
437     QuotRemWithBB Fast = createFastBB(SuccessorBB);
438     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
439     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
440     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
441                                             DivisorShort ? nullptr : Divisor);
442     IRBuilder<> Builder(MainBB, MainBB->end());
443     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
444     return Result;
445   }
446 }
447 
448 /// This optimization identifies DIV/REM instructions in a BB that can be
449 /// profitably bypassed and carried out with a shorter, faster divide.
450 bool llvm::bypassSlowDivision(BasicBlock *BB,
451                               const BypassWidthsTy &BypassWidths) {
452   DivCacheTy PerBBDivCache;
453 
454   bool MadeChange = false;
455   Instruction* Next = &*BB->begin();
456   while (Next != nullptr) {
457     // We may add instructions immediately after I, but we want to skip over
458     // them.
459     Instruction* I = Next;
460     Next = Next->getNextNode();
461 
462     FastDivInsertionTask Task(I, BypassWidths);
463     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
464       I->replaceAllUsesWith(Replacement);
465       I->eraseFromParent();
466       MadeChange = true;
467     }
468   }
469 
470   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
471   // create divrem machine instructions.  Now erase any unused divs / rems so we
472   // don't leave extra instructions sitting around.
473   for (auto &KV : PerBBDivCache)
474     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
475       RecursivelyDeleteTriviallyDeadInstructions(V);
476 
477   return MadeChange;
478 }
479