1 //===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains an optimization for div and rem on architectures that
11 // execute short instructions significantly faster than longer instructions.
12 // For example, on Intel Atom 32-bit divides are slow enough that during
13 // runtime it is profitable to check the value of the operands, and if they are
14 // positive and less than 256 use an unsigned 8-bit divide.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/Function.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/Support/KnownBits.h"
26 #include "llvm/Transforms/Utils/Local.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "bypass-slow-division"
31 
32 namespace {
33   struct DivOpInfo {
34     bool SignedOp;
35     Value *Dividend;
36     Value *Divisor;
37 
38     DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
39       : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
40   };
41 
42   struct QuotRemPair {
43     Value *Quotient;
44     Value *Remainder;
45 
46     QuotRemPair(Value *InQuotient, Value *InRemainder)
47         : Quotient(InQuotient), Remainder(InRemainder) {}
48   };
49 
50   /// A quotient and remainder, plus a BB from which they logically "originate".
51   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
52   /// corresponding predecessor.
53   struct QuotRemWithBB {
54     BasicBlock *BB = nullptr;
55     Value *Quotient = nullptr;
56     Value *Remainder = nullptr;
57   };
58 }
59 
60 namespace llvm {
61   template<>
62   struct DenseMapInfo<DivOpInfo> {
63     static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
64       return Val1.SignedOp == Val2.SignedOp &&
65              Val1.Dividend == Val2.Dividend &&
66              Val1.Divisor == Val2.Divisor;
67     }
68 
69     static DivOpInfo getEmptyKey() {
70       return DivOpInfo(false, nullptr, nullptr);
71     }
72 
73     static DivOpInfo getTombstoneKey() {
74       return DivOpInfo(true, nullptr, nullptr);
75     }
76 
77     static unsigned getHashValue(const DivOpInfo &Val) {
78       return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
79                         reinterpret_cast<uintptr_t>(Val.Divisor)) ^
80                         (unsigned)Val.SignedOp;
81     }
82   };
83 
84   typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
85   typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
86   typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
87 }
88 
89 namespace {
90 enum ValueRange {
91   /// Operand definitely fits into BypassType. No runtime checks are needed.
92   VALRNG_KNOWN_SHORT,
93   /// A runtime check is required, as value range is unknown.
94   VALRNG_UNKNOWN,
95   /// Operand is unlikely to fit into BypassType. The bypassing should be
96   /// disabled.
97   VALRNG_LIKELY_LONG
98 };
99 
100 class FastDivInsertionTask {
101   bool IsValidTask = false;
102   Instruction *SlowDivOrRem = nullptr;
103   IntegerType *BypassType = nullptr;
104   BasicBlock *MainBB = nullptr;
105 
106   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
107   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
108   QuotRemWithBB createSlowBB(BasicBlock *Successor);
109   QuotRemWithBB createFastBB(BasicBlock *Successor);
110   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
111                                    BasicBlock *PhiBB);
112   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
113   Optional<QuotRemPair> insertFastDivAndRem();
114 
115   bool isSignedOp() {
116     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
117            SlowDivOrRem->getOpcode() == Instruction::SRem;
118   }
119   bool isDivisionOp() {
120     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
121            SlowDivOrRem->getOpcode() == Instruction::UDiv;
122   }
123   Type *getSlowType() { return SlowDivOrRem->getType(); }
124 
125 public:
126   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
127   Value *getReplacement(DivCacheTy &Cache);
128 };
129 } // anonymous namespace
130 
131 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
132                                            const BypassWidthsTy &BypassWidths) {
133   switch (I->getOpcode()) {
134   case Instruction::UDiv:
135   case Instruction::SDiv:
136   case Instruction::URem:
137   case Instruction::SRem:
138     SlowDivOrRem = I;
139     break;
140   default:
141     // I is not a div/rem operation.
142     return;
143   }
144 
145   // Skip division on vector types. Only optimize integer instructions.
146   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
147   if (!SlowType)
148     return;
149 
150   // Skip if this bitwidth is not bypassed.
151   auto BI = BypassWidths.find(SlowType->getBitWidth());
152   if (BI == BypassWidths.end())
153     return;
154 
155   // Get type for div/rem instruction with bypass bitwidth.
156   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
157   BypassType = BT;
158 
159   // The original basic block.
160   MainBB = I->getParent();
161 
162   // The instruction is indeed a slow div or rem operation.
163   IsValidTask = true;
164 }
165 
166 /// Reuses previously-computed dividend or remainder from the current BB if
167 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
168 /// perform the optimization and caches the resulting dividend and remainder.
169 /// If no replacement can be generated, nullptr is returned.
170 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
171   // First, make sure that the task is valid.
172   if (!IsValidTask)
173     return nullptr;
174 
175   // Then, look for a value in Cache.
176   Value *Dividend = SlowDivOrRem->getOperand(0);
177   Value *Divisor = SlowDivOrRem->getOperand(1);
178   DivOpInfo Key(isSignedOp(), Dividend, Divisor);
179   auto CacheI = Cache.find(Key);
180 
181   if (CacheI == Cache.end()) {
182     // If previous instance does not exist, try to insert fast div.
183     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
184     // Bail out if insertFastDivAndRem has failed.
185     if (!OptResult)
186       return nullptr;
187     CacheI = Cache.insert({Key, *OptResult}).first;
188   }
189 
190   QuotRemPair &Value = CacheI->second;
191   return isDivisionOp() ? Value.Quotient : Value.Remainder;
192 }
193 
194 /// \brief Check if a value looks like a hash.
195 ///
196 /// The routine is expected to detect values computed using the most common hash
197 /// algorithms. Typically, hash computations end with one of the following
198 /// instructions:
199 ///
200 /// 1) MUL with a constant wider than BypassType
201 /// 2) XOR instruction
202 ///
203 /// And even if we are wrong and the value is not a hash, it is still quite
204 /// unlikely that such values will fit into BypassType.
205 ///
206 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
207 /// It is implemented as a depth-first search for values that look neither long
208 /// nor hash-like.
209 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
210   Instruction *I = dyn_cast<Instruction>(V);
211   if (!I)
212     return false;
213 
214   switch (I->getOpcode()) {
215   case Instruction::Xor:
216     return true;
217   case Instruction::Mul: {
218     // After Constant Hoisting pass, long constants may be represented as
219     // bitcast instructions. As a result, some constants may look like an
220     // instruction at first, and an additional check is necessary to find out if
221     // an operand is actually a constant.
222     Value *Op1 = I->getOperand(1);
223     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
224     if (!C && isa<BitCastInst>(Op1))
225       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
226     return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
227   }
228   case Instruction::PHI: {
229     // Stop IR traversal in case of a crazy input code. This limits recursion
230     // depth.
231     if (Visited.size() >= 16)
232       return false;
233     // Do not visit nodes that have been visited already. We return true because
234     // it means that we couldn't find any value that doesn't look hash-like.
235     if (Visited.find(I) != Visited.end())
236       return true;
237     Visited.insert(I);
238     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
239       // Ignore undef values as they probably don't affect the division
240       // operands.
241       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
242              isa<UndefValue>(V);
243     });
244   }
245   default:
246     return false;
247   }
248 }
249 
250 /// Check if an integer value fits into our bypass type.
251 ValueRange FastDivInsertionTask::getValueRange(Value *V,
252                                                VisitedSetTy &Visited) {
253   unsigned ShortLen = BypassType->getBitWidth();
254   unsigned LongLen = V->getType()->getIntegerBitWidth();
255 
256   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
257   unsigned HiBits = LongLen - ShortLen;
258 
259   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
260   KnownBits Known(LongLen);
261 
262   computeKnownBits(V, Known, DL);
263 
264   if (Known.countMinLeadingZeros() >= HiBits)
265     return VALRNG_KNOWN_SHORT;
266 
267   if (Known.countMaxLeadingZeros() < HiBits)
268     return VALRNG_LIKELY_LONG;
269 
270   // Long integer divisions are often used in hashtable implementations. It's
271   // not worth bypassing such divisions because hash values are extremely
272   // unlikely to have enough leading zeros. The call below tries to detect
273   // values that are unlikely to fit BypassType (including hashes).
274   if (isHashLikeValue(V, Visited))
275     return VALRNG_LIKELY_LONG;
276 
277   return VALRNG_UNKNOWN;
278 }
279 
280 /// Add new basic block for slow div and rem operations and put it before
281 /// SuccessorBB.
282 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
283   QuotRemWithBB DivRemPair;
284   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
285                                      MainBB->getParent(), SuccessorBB);
286   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
287 
288   Value *Dividend = SlowDivOrRem->getOperand(0);
289   Value *Divisor = SlowDivOrRem->getOperand(1);
290 
291   if (isSignedOp()) {
292     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
293     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
294   } else {
295     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
296     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
297   }
298 
299   Builder.CreateBr(SuccessorBB);
300   return DivRemPair;
301 }
302 
303 /// Add new basic block for fast div and rem operations and put it before
304 /// SuccessorBB.
305 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
306   QuotRemWithBB DivRemPair;
307   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
308                                      MainBB->getParent(), SuccessorBB);
309   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
310 
311   Value *Dividend = SlowDivOrRem->getOperand(0);
312   Value *Divisor = SlowDivOrRem->getOperand(1);
313   Value *ShortDivisorV =
314       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
315   Value *ShortDividendV =
316       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
317 
318   // udiv/urem because this optimization only handles positive numbers.
319   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
320   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
321   DivRemPair.Quotient =
322       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
323   DivRemPair.Remainder =
324       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
325   Builder.CreateBr(SuccessorBB);
326 
327   return DivRemPair;
328 }
329 
330 /// Creates Phi nodes for result of Div and Rem.
331 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
332                                                        QuotRemWithBB &RHS,
333                                                        BasicBlock *PhiBB) {
334   IRBuilder<> Builder(PhiBB, PhiBB->begin());
335   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
336   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
337   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
338   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
339   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
340   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
341   return QuotRemPair(QuoPhi, RemPhi);
342 }
343 
344 /// Creates a runtime check to test whether both the divisor and dividend fit
345 /// into BypassType. The check is inserted at the end of MainBB. True return
346 /// value means that the operands fit. Either of the operands may be NULL if it
347 /// doesn't need a runtime check.
348 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
349   assert((Op1 || Op2) && "Nothing to check");
350   IRBuilder<> Builder(MainBB, MainBB->end());
351 
352   Value *OrV;
353   if (Op1 && Op2)
354     OrV = Builder.CreateOr(Op1, Op2);
355   else
356     OrV = Op1 ? Op1 : Op2;
357 
358   // BitMask is inverted to check if the operands are
359   // larger than the bypass type
360   uint64_t BitMask = ~BypassType->getBitMask();
361   Value *AndV = Builder.CreateAnd(OrV, BitMask);
362 
363   // Compare operand values
364   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
365   return Builder.CreateICmpEQ(AndV, ZeroV);
366 }
367 
368 /// Substitutes the div/rem instruction with code that checks the value of the
369 /// operands and uses a shorter-faster div/rem instruction when possible.
370 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
371   Value *Dividend = SlowDivOrRem->getOperand(0);
372   Value *Divisor = SlowDivOrRem->getOperand(1);
373 
374   if (isa<ConstantInt>(Divisor)) {
375     // Keep division by a constant for DAGCombiner.
376     return None;
377   }
378 
379   VisitedSetTy SetL;
380   ValueRange DividendRange = getValueRange(Dividend, SetL);
381   if (DividendRange == VALRNG_LIKELY_LONG)
382     return None;
383 
384   VisitedSetTy SetR;
385   ValueRange DivisorRange = getValueRange(Divisor, SetR);
386   if (DivisorRange == VALRNG_LIKELY_LONG)
387     return None;
388 
389   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
390   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
391 
392   if (DividendShort && DivisorShort) {
393     // If both operands are known to be short then just replace the long
394     // division with a short one in-place.
395 
396     IRBuilder<> Builder(SlowDivOrRem);
397     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
398     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
399     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
400     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
401     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
402     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
403     return QuotRemPair(ExtDiv, ExtRem);
404   } else if (DividendShort && !isSignedOp()) {
405     // If the division is unsigned and Dividend is known to be short, then
406     // either
407     // 1) Divisor is less or equal to Dividend, and the result can be computed
408     //    with a short division.
409     // 2) Divisor is greater than Dividend. In this case, no division is needed
410     //    at all: The quotient is 0 and the remainder is equal to Dividend.
411     //
412     // So instead of checking at runtime whether Divisor fits into BypassType,
413     // we emit a runtime check to differentiate between these two cases. This
414     // lets us entirely avoid a long div.
415 
416     // Split the basic block before the div/rem.
417     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
418     // Remove the unconditional branch from MainBB to SuccessorBB.
419     MainBB->getInstList().back().eraseFromParent();
420     QuotRemWithBB Long;
421     Long.BB = MainBB;
422     Long.Quotient = ConstantInt::get(getSlowType(), 0);
423     Long.Remainder = Dividend;
424     QuotRemWithBB Fast = createFastBB(SuccessorBB);
425     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
426     IRBuilder<> Builder(MainBB, MainBB->end());
427     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
428     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
429     return Result;
430   } else {
431     // General case. Create both slow and fast div/rem pairs and choose one of
432     // them at runtime.
433 
434     // Split the basic block before the div/rem.
435     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
436     // Remove the unconditional branch from MainBB to SuccessorBB.
437     MainBB->getInstList().back().eraseFromParent();
438     QuotRemWithBB Fast = createFastBB(SuccessorBB);
439     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
440     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
441     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
442                                             DivisorShort ? nullptr : Divisor);
443     IRBuilder<> Builder(MainBB, MainBB->end());
444     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
445     return Result;
446   }
447 }
448 
449 /// This optimization identifies DIV/REM instructions in a BB that can be
450 /// profitably bypassed and carried out with a shorter, faster divide.
451 bool llvm::bypassSlowDivision(BasicBlock *BB,
452                               const BypassWidthsTy &BypassWidths) {
453   DivCacheTy PerBBDivCache;
454 
455   bool MadeChange = false;
456   Instruction* Next = &*BB->begin();
457   while (Next != nullptr) {
458     // We may add instructions immediately after I, but we want to skip over
459     // them.
460     Instruction* I = Next;
461     Next = Next->getNextNode();
462 
463     FastDivInsertionTask Task(I, BypassWidths);
464     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
465       I->replaceAllUsesWith(Replacement);
466       I->eraseFromParent();
467       MadeChange = true;
468     }
469   }
470 
471   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
472   // create divrem machine instructions.  Now erase any unused divs / rems so we
473   // don't leave extra instructions sitting around.
474   for (auto &KV : PerBBDivCache)
475     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
476       RecursivelyDeleteTriviallyDeadInstructions(V);
477 
478   return MadeChange;
479 }
480