1 //===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file contains an optimization for div and rem on architectures that
11 // execute short instructions significantly faster than longer instructions.
12 // For example, on Intel Atom 32-bit divides are slow enough that during
13 // runtime it is profitable to check the value of the operands, and if they are
14 // positive and less than 256 use an unsigned 8-bit divide.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/SmallPtrSet.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/IR/Function.h"
23 #include "llvm/IR/IRBuilder.h"
24 #include "llvm/IR/Instructions.h"
25 #include "llvm/Support/KnownBits.h"
26 #include "llvm/Transforms/Utils/Local.h"
27 
28 using namespace llvm;
29 
30 #define DEBUG_TYPE "bypass-slow-division"
31 
32 namespace {
33   struct QuotRemPair {
34     Value *Quotient;
35     Value *Remainder;
36 
37     QuotRemPair(Value *InQuotient, Value *InRemainder)
38         : Quotient(InQuotient), Remainder(InRemainder) {}
39   };
40 
41   /// A quotient and remainder, plus a BB from which they logically "originate".
42   /// If you use Quotient or Remainder in a Phi node, you should use BB as its
43   /// corresponding predecessor.
44   struct QuotRemWithBB {
45     BasicBlock *BB = nullptr;
46     Value *Quotient = nullptr;
47     Value *Remainder = nullptr;
48   };
49 }
50 
51 namespace llvm {
52   typedef DenseMap<DivRemMapKey, QuotRemPair> DivCacheTy;
53   typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
54   typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
55 }
56 
57 namespace {
58 enum ValueRange {
59   /// Operand definitely fits into BypassType. No runtime checks are needed.
60   VALRNG_KNOWN_SHORT,
61   /// A runtime check is required, as value range is unknown.
62   VALRNG_UNKNOWN,
63   /// Operand is unlikely to fit into BypassType. The bypassing should be
64   /// disabled.
65   VALRNG_LIKELY_LONG
66 };
67 
68 class FastDivInsertionTask {
69   bool IsValidTask = false;
70   Instruction *SlowDivOrRem = nullptr;
71   IntegerType *BypassType = nullptr;
72   BasicBlock *MainBB = nullptr;
73 
74   bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
75   ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
76   QuotRemWithBB createSlowBB(BasicBlock *Successor);
77   QuotRemWithBB createFastBB(BasicBlock *Successor);
78   QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
79                                    BasicBlock *PhiBB);
80   Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
81   Optional<QuotRemPair> insertFastDivAndRem();
82 
83   bool isSignedOp() {
84     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
85            SlowDivOrRem->getOpcode() == Instruction::SRem;
86   }
87   bool isDivisionOp() {
88     return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
89            SlowDivOrRem->getOpcode() == Instruction::UDiv;
90   }
91   Type *getSlowType() { return SlowDivOrRem->getType(); }
92 
93 public:
94   FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
95   Value *getReplacement(DivCacheTy &Cache);
96 };
97 } // anonymous namespace
98 
99 FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
100                                            const BypassWidthsTy &BypassWidths) {
101   switch (I->getOpcode()) {
102   case Instruction::UDiv:
103   case Instruction::SDiv:
104   case Instruction::URem:
105   case Instruction::SRem:
106     SlowDivOrRem = I;
107     break;
108   default:
109     // I is not a div/rem operation.
110     return;
111   }
112 
113   // Skip division on vector types. Only optimize integer instructions.
114   IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
115   if (!SlowType)
116     return;
117 
118   // Skip if this bitwidth is not bypassed.
119   auto BI = BypassWidths.find(SlowType->getBitWidth());
120   if (BI == BypassWidths.end())
121     return;
122 
123   // Get type for div/rem instruction with bypass bitwidth.
124   IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
125   BypassType = BT;
126 
127   // The original basic block.
128   MainBB = I->getParent();
129 
130   // The instruction is indeed a slow div or rem operation.
131   IsValidTask = true;
132 }
133 
134 /// Reuses previously-computed dividend or remainder from the current BB if
135 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to
136 /// perform the optimization and caches the resulting dividend and remainder.
137 /// If no replacement can be generated, nullptr is returned.
138 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
139   // First, make sure that the task is valid.
140   if (!IsValidTask)
141     return nullptr;
142 
143   // Then, look for a value in Cache.
144   Value *Dividend = SlowDivOrRem->getOperand(0);
145   Value *Divisor = SlowDivOrRem->getOperand(1);
146   DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
147   auto CacheI = Cache.find(Key);
148 
149   if (CacheI == Cache.end()) {
150     // If previous instance does not exist, try to insert fast div.
151     Optional<QuotRemPair> OptResult = insertFastDivAndRem();
152     // Bail out if insertFastDivAndRem has failed.
153     if (!OptResult)
154       return nullptr;
155     CacheI = Cache.insert({Key, *OptResult}).first;
156   }
157 
158   QuotRemPair &Value = CacheI->second;
159   return isDivisionOp() ? Value.Quotient : Value.Remainder;
160 }
161 
162 /// \brief Check if a value looks like a hash.
163 ///
164 /// The routine is expected to detect values computed using the most common hash
165 /// algorithms. Typically, hash computations end with one of the following
166 /// instructions:
167 ///
168 /// 1) MUL with a constant wider than BypassType
169 /// 2) XOR instruction
170 ///
171 /// And even if we are wrong and the value is not a hash, it is still quite
172 /// unlikely that such values will fit into BypassType.
173 ///
174 /// To detect string hash algorithms like FNV we have to look through PHI-nodes.
175 /// It is implemented as a depth-first search for values that look neither long
176 /// nor hash-like.
177 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
178   Instruction *I = dyn_cast<Instruction>(V);
179   if (!I)
180     return false;
181 
182   switch (I->getOpcode()) {
183   case Instruction::Xor:
184     return true;
185   case Instruction::Mul: {
186     // After Constant Hoisting pass, long constants may be represented as
187     // bitcast instructions. As a result, some constants may look like an
188     // instruction at first, and an additional check is necessary to find out if
189     // an operand is actually a constant.
190     Value *Op1 = I->getOperand(1);
191     ConstantInt *C = dyn_cast<ConstantInt>(Op1);
192     if (!C && isa<BitCastInst>(Op1))
193       C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
194     return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
195   }
196   case Instruction::PHI: {
197     // Stop IR traversal in case of a crazy input code. This limits recursion
198     // depth.
199     if (Visited.size() >= 16)
200       return false;
201     // Do not visit nodes that have been visited already. We return true because
202     // it means that we couldn't find any value that doesn't look hash-like.
203     if (Visited.find(I) != Visited.end())
204       return true;
205     Visited.insert(I);
206     return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
207       // Ignore undef values as they probably don't affect the division
208       // operands.
209       return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
210              isa<UndefValue>(V);
211     });
212   }
213   default:
214     return false;
215   }
216 }
217 
218 /// Check if an integer value fits into our bypass type.
219 ValueRange FastDivInsertionTask::getValueRange(Value *V,
220                                                VisitedSetTy &Visited) {
221   unsigned ShortLen = BypassType->getBitWidth();
222   unsigned LongLen = V->getType()->getIntegerBitWidth();
223 
224   assert(LongLen > ShortLen && "Value type must be wider than BypassType");
225   unsigned HiBits = LongLen - ShortLen;
226 
227   const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
228   KnownBits Known(LongLen);
229 
230   computeKnownBits(V, Known, DL);
231 
232   if (Known.countMinLeadingZeros() >= HiBits)
233     return VALRNG_KNOWN_SHORT;
234 
235   if (Known.countMaxLeadingZeros() < HiBits)
236     return VALRNG_LIKELY_LONG;
237 
238   // Long integer divisions are often used in hashtable implementations. It's
239   // not worth bypassing such divisions because hash values are extremely
240   // unlikely to have enough leading zeros. The call below tries to detect
241   // values that are unlikely to fit BypassType (including hashes).
242   if (isHashLikeValue(V, Visited))
243     return VALRNG_LIKELY_LONG;
244 
245   return VALRNG_UNKNOWN;
246 }
247 
248 /// Add new basic block for slow div and rem operations and put it before
249 /// SuccessorBB.
250 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
251   QuotRemWithBB DivRemPair;
252   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
253                                      MainBB->getParent(), SuccessorBB);
254   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
255 
256   Value *Dividend = SlowDivOrRem->getOperand(0);
257   Value *Divisor = SlowDivOrRem->getOperand(1);
258 
259   if (isSignedOp()) {
260     DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
261     DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
262   } else {
263     DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
264     DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
265   }
266 
267   Builder.CreateBr(SuccessorBB);
268   return DivRemPair;
269 }
270 
271 /// Add new basic block for fast div and rem operations and put it before
272 /// SuccessorBB.
273 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
274   QuotRemWithBB DivRemPair;
275   DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
276                                      MainBB->getParent(), SuccessorBB);
277   IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
278 
279   Value *Dividend = SlowDivOrRem->getOperand(0);
280   Value *Divisor = SlowDivOrRem->getOperand(1);
281   Value *ShortDivisorV =
282       Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
283   Value *ShortDividendV =
284       Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
285 
286   // udiv/urem because this optimization only handles positive numbers.
287   Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
288   Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
289   DivRemPair.Quotient =
290       Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
291   DivRemPair.Remainder =
292       Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
293   Builder.CreateBr(SuccessorBB);
294 
295   return DivRemPair;
296 }
297 
298 /// Creates Phi nodes for result of Div and Rem.
299 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
300                                                        QuotRemWithBB &RHS,
301                                                        BasicBlock *PhiBB) {
302   IRBuilder<> Builder(PhiBB, PhiBB->begin());
303   PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
304   QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
305   QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
306   PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
307   RemPhi->addIncoming(LHS.Remainder, LHS.BB);
308   RemPhi->addIncoming(RHS.Remainder, RHS.BB);
309   return QuotRemPair(QuoPhi, RemPhi);
310 }
311 
312 /// Creates a runtime check to test whether both the divisor and dividend fit
313 /// into BypassType. The check is inserted at the end of MainBB. True return
314 /// value means that the operands fit. Either of the operands may be NULL if it
315 /// doesn't need a runtime check.
316 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
317   assert((Op1 || Op2) && "Nothing to check");
318   IRBuilder<> Builder(MainBB, MainBB->end());
319 
320   Value *OrV;
321   if (Op1 && Op2)
322     OrV = Builder.CreateOr(Op1, Op2);
323   else
324     OrV = Op1 ? Op1 : Op2;
325 
326   // BitMask is inverted to check if the operands are
327   // larger than the bypass type
328   uint64_t BitMask = ~BypassType->getBitMask();
329   Value *AndV = Builder.CreateAnd(OrV, BitMask);
330 
331   // Compare operand values
332   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
333   return Builder.CreateICmpEQ(AndV, ZeroV);
334 }
335 
336 /// Substitutes the div/rem instruction with code that checks the value of the
337 /// operands and uses a shorter-faster div/rem instruction when possible.
338 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
339   Value *Dividend = SlowDivOrRem->getOperand(0);
340   Value *Divisor = SlowDivOrRem->getOperand(1);
341 
342   if (isa<ConstantInt>(Divisor)) {
343     // Keep division by a constant for DAGCombiner.
344     return None;
345   }
346 
347   VisitedSetTy SetL;
348   ValueRange DividendRange = getValueRange(Dividend, SetL);
349   if (DividendRange == VALRNG_LIKELY_LONG)
350     return None;
351 
352   VisitedSetTy SetR;
353   ValueRange DivisorRange = getValueRange(Divisor, SetR);
354   if (DivisorRange == VALRNG_LIKELY_LONG)
355     return None;
356 
357   bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
358   bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
359 
360   if (DividendShort && DivisorShort) {
361     // If both operands are known to be short then just replace the long
362     // division with a short one in-place.
363 
364     IRBuilder<> Builder(SlowDivOrRem);
365     Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
366     Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
367     Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
368     Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
369     Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
370     Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
371     return QuotRemPair(ExtDiv, ExtRem);
372   } else if (DividendShort && !isSignedOp()) {
373     // If the division is unsigned and Dividend is known to be short, then
374     // either
375     // 1) Divisor is less or equal to Dividend, and the result can be computed
376     //    with a short division.
377     // 2) Divisor is greater than Dividend. In this case, no division is needed
378     //    at all: The quotient is 0 and the remainder is equal to Dividend.
379     //
380     // So instead of checking at runtime whether Divisor fits into BypassType,
381     // we emit a runtime check to differentiate between these two cases. This
382     // lets us entirely avoid a long div.
383 
384     // Split the basic block before the div/rem.
385     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
386     // Remove the unconditional branch from MainBB to SuccessorBB.
387     MainBB->getInstList().back().eraseFromParent();
388     QuotRemWithBB Long;
389     Long.BB = MainBB;
390     Long.Quotient = ConstantInt::get(getSlowType(), 0);
391     Long.Remainder = Dividend;
392     QuotRemWithBB Fast = createFastBB(SuccessorBB);
393     QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
394     IRBuilder<> Builder(MainBB, MainBB->end());
395     Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
396     Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
397     return Result;
398   } else {
399     // General case. Create both slow and fast div/rem pairs and choose one of
400     // them at runtime.
401 
402     // Split the basic block before the div/rem.
403     BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
404     // Remove the unconditional branch from MainBB to SuccessorBB.
405     MainBB->getInstList().back().eraseFromParent();
406     QuotRemWithBB Fast = createFastBB(SuccessorBB);
407     QuotRemWithBB Slow = createSlowBB(SuccessorBB);
408     QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
409     Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
410                                             DivisorShort ? nullptr : Divisor);
411     IRBuilder<> Builder(MainBB, MainBB->end());
412     Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
413     return Result;
414   }
415 }
416 
417 /// This optimization identifies DIV/REM instructions in a BB that can be
418 /// profitably bypassed and carried out with a shorter, faster divide.
419 bool llvm::bypassSlowDivision(BasicBlock *BB,
420                               const BypassWidthsTy &BypassWidths) {
421   DivCacheTy PerBBDivCache;
422 
423   bool MadeChange = false;
424   Instruction* Next = &*BB->begin();
425   while (Next != nullptr) {
426     // We may add instructions immediately after I, but we want to skip over
427     // them.
428     Instruction* I = Next;
429     Next = Next->getNextNode();
430 
431     FastDivInsertionTask Task(I, BypassWidths);
432     if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
433       I->replaceAllUsesWith(Replacement);
434       I->eraseFromParent();
435       MadeChange = true;
436     }
437   }
438 
439   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
440   // create divrem machine instructions.  Now erase any unused divs / rems so we
441   // don't leave extra instructions sitting around.
442   for (auto &KV : PerBBDivCache)
443     for (Value *V : {KV.second.Quotient, KV.second.Remainder})
444       RecursivelyDeleteTriviallyDeadInstructions(V);
445 
446   return MadeChange;
447 }
448