1 //===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains an optimization for div and rem on architectures that 11 // execute short instructions significantly faster than longer instructions. 12 // For example, on Intel Atom 32-bit divides are slow enough that during 13 // runtime it is profitable to check the value of the operands, and if they are 14 // positive and less than 256 use an unsigned 8-bit divide. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 19 #include "llvm/ADT/DenseMap.h" 20 #include "llvm/ADT/SmallPtrSet.h" 21 #include "llvm/Analysis/ValueTracking.h" 22 #include "llvm/IR/Function.h" 23 #include "llvm/IR/IRBuilder.h" 24 #include "llvm/IR/Instructions.h" 25 #include "llvm/Support/KnownBits.h" 26 #include "llvm/Transforms/Utils/Local.h" 27 28 using namespace llvm; 29 30 #define DEBUG_TYPE "bypass-slow-division" 31 32 namespace { 33 struct QuotRemPair { 34 Value *Quotient; 35 Value *Remainder; 36 37 QuotRemPair(Value *InQuotient, Value *InRemainder) 38 : Quotient(InQuotient), Remainder(InRemainder) {} 39 }; 40 41 /// A quotient and remainder, plus a BB from which they logically "originate". 42 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 43 /// corresponding predecessor. 44 struct QuotRemWithBB { 45 BasicBlock *BB = nullptr; 46 Value *Quotient = nullptr; 47 Value *Remainder = nullptr; 48 }; 49 } 50 51 namespace llvm { 52 typedef DenseMap<DivRemMapKey, QuotRemPair> DivCacheTy; 53 typedef DenseMap<unsigned, unsigned> BypassWidthsTy; 54 typedef SmallPtrSet<Instruction *, 4> VisitedSetTy; 55 } 56 57 namespace { 58 enum ValueRange { 59 /// Operand definitely fits into BypassType. No runtime checks are needed. 60 VALRNG_KNOWN_SHORT, 61 /// A runtime check is required, as value range is unknown. 62 VALRNG_UNKNOWN, 63 /// Operand is unlikely to fit into BypassType. The bypassing should be 64 /// disabled. 65 VALRNG_LIKELY_LONG 66 }; 67 68 class FastDivInsertionTask { 69 bool IsValidTask = false; 70 Instruction *SlowDivOrRem = nullptr; 71 IntegerType *BypassType = nullptr; 72 BasicBlock *MainBB = nullptr; 73 74 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 75 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 76 QuotRemWithBB createSlowBB(BasicBlock *Successor); 77 QuotRemWithBB createFastBB(BasicBlock *Successor); 78 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 79 BasicBlock *PhiBB); 80 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 81 Optional<QuotRemPair> insertFastDivAndRem(); 82 83 bool isSignedOp() { 84 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 85 SlowDivOrRem->getOpcode() == Instruction::SRem; 86 } 87 bool isDivisionOp() { 88 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 89 SlowDivOrRem->getOpcode() == Instruction::UDiv; 90 } 91 Type *getSlowType() { return SlowDivOrRem->getType(); } 92 93 public: 94 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 95 Value *getReplacement(DivCacheTy &Cache); 96 }; 97 } // anonymous namespace 98 99 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 100 const BypassWidthsTy &BypassWidths) { 101 switch (I->getOpcode()) { 102 case Instruction::UDiv: 103 case Instruction::SDiv: 104 case Instruction::URem: 105 case Instruction::SRem: 106 SlowDivOrRem = I; 107 break; 108 default: 109 // I is not a div/rem operation. 110 return; 111 } 112 113 // Skip division on vector types. Only optimize integer instructions. 114 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 115 if (!SlowType) 116 return; 117 118 // Skip if this bitwidth is not bypassed. 119 auto BI = BypassWidths.find(SlowType->getBitWidth()); 120 if (BI == BypassWidths.end()) 121 return; 122 123 // Get type for div/rem instruction with bypass bitwidth. 124 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 125 BypassType = BT; 126 127 // The original basic block. 128 MainBB = I->getParent(); 129 130 // The instruction is indeed a slow div or rem operation. 131 IsValidTask = true; 132 } 133 134 /// Reuses previously-computed dividend or remainder from the current BB if 135 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 136 /// perform the optimization and caches the resulting dividend and remainder. 137 /// If no replacement can be generated, nullptr is returned. 138 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 139 // First, make sure that the task is valid. 140 if (!IsValidTask) 141 return nullptr; 142 143 // Then, look for a value in Cache. 144 Value *Dividend = SlowDivOrRem->getOperand(0); 145 Value *Divisor = SlowDivOrRem->getOperand(1); 146 DivRemMapKey Key(isSignedOp(), Dividend, Divisor); 147 auto CacheI = Cache.find(Key); 148 149 if (CacheI == Cache.end()) { 150 // If previous instance does not exist, try to insert fast div. 151 Optional<QuotRemPair> OptResult = insertFastDivAndRem(); 152 // Bail out if insertFastDivAndRem has failed. 153 if (!OptResult) 154 return nullptr; 155 CacheI = Cache.insert({Key, *OptResult}).first; 156 } 157 158 QuotRemPair &Value = CacheI->second; 159 return isDivisionOp() ? Value.Quotient : Value.Remainder; 160 } 161 162 /// \brief Check if a value looks like a hash. 163 /// 164 /// The routine is expected to detect values computed using the most common hash 165 /// algorithms. Typically, hash computations end with one of the following 166 /// instructions: 167 /// 168 /// 1) MUL with a constant wider than BypassType 169 /// 2) XOR instruction 170 /// 171 /// And even if we are wrong and the value is not a hash, it is still quite 172 /// unlikely that such values will fit into BypassType. 173 /// 174 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 175 /// It is implemented as a depth-first search for values that look neither long 176 /// nor hash-like. 177 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 178 Instruction *I = dyn_cast<Instruction>(V); 179 if (!I) 180 return false; 181 182 switch (I->getOpcode()) { 183 case Instruction::Xor: 184 return true; 185 case Instruction::Mul: { 186 // After Constant Hoisting pass, long constants may be represented as 187 // bitcast instructions. As a result, some constants may look like an 188 // instruction at first, and an additional check is necessary to find out if 189 // an operand is actually a constant. 190 Value *Op1 = I->getOperand(1); 191 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 192 if (!C && isa<BitCastInst>(Op1)) 193 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 194 return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); 195 } 196 case Instruction::PHI: { 197 // Stop IR traversal in case of a crazy input code. This limits recursion 198 // depth. 199 if (Visited.size() >= 16) 200 return false; 201 // Do not visit nodes that have been visited already. We return true because 202 // it means that we couldn't find any value that doesn't look hash-like. 203 if (Visited.find(I) != Visited.end()) 204 return true; 205 Visited.insert(I); 206 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 207 // Ignore undef values as they probably don't affect the division 208 // operands. 209 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 210 isa<UndefValue>(V); 211 }); 212 } 213 default: 214 return false; 215 } 216 } 217 218 /// Check if an integer value fits into our bypass type. 219 ValueRange FastDivInsertionTask::getValueRange(Value *V, 220 VisitedSetTy &Visited) { 221 unsigned ShortLen = BypassType->getBitWidth(); 222 unsigned LongLen = V->getType()->getIntegerBitWidth(); 223 224 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 225 unsigned HiBits = LongLen - ShortLen; 226 227 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 228 KnownBits Known(LongLen); 229 230 computeKnownBits(V, Known, DL); 231 232 if (Known.countMinLeadingZeros() >= HiBits) 233 return VALRNG_KNOWN_SHORT; 234 235 if (Known.countMaxLeadingZeros() < HiBits) 236 return VALRNG_LIKELY_LONG; 237 238 // Long integer divisions are often used in hashtable implementations. It's 239 // not worth bypassing such divisions because hash values are extremely 240 // unlikely to have enough leading zeros. The call below tries to detect 241 // values that are unlikely to fit BypassType (including hashes). 242 if (isHashLikeValue(V, Visited)) 243 return VALRNG_LIKELY_LONG; 244 245 return VALRNG_UNKNOWN; 246 } 247 248 /// Add new basic block for slow div and rem operations and put it before 249 /// SuccessorBB. 250 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 251 QuotRemWithBB DivRemPair; 252 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 253 MainBB->getParent(), SuccessorBB); 254 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 255 256 Value *Dividend = SlowDivOrRem->getOperand(0); 257 Value *Divisor = SlowDivOrRem->getOperand(1); 258 259 if (isSignedOp()) { 260 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 261 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 262 } else { 263 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 264 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 265 } 266 267 Builder.CreateBr(SuccessorBB); 268 return DivRemPair; 269 } 270 271 /// Add new basic block for fast div and rem operations and put it before 272 /// SuccessorBB. 273 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 274 QuotRemWithBB DivRemPair; 275 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 276 MainBB->getParent(), SuccessorBB); 277 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 278 279 Value *Dividend = SlowDivOrRem->getOperand(0); 280 Value *Divisor = SlowDivOrRem->getOperand(1); 281 Value *ShortDivisorV = 282 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 283 Value *ShortDividendV = 284 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 285 286 // udiv/urem because this optimization only handles positive numbers. 287 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 288 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 289 DivRemPair.Quotient = 290 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 291 DivRemPair.Remainder = 292 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 293 Builder.CreateBr(SuccessorBB); 294 295 return DivRemPair; 296 } 297 298 /// Creates Phi nodes for result of Div and Rem. 299 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 300 QuotRemWithBB &RHS, 301 BasicBlock *PhiBB) { 302 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 303 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 304 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 305 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 306 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 307 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 308 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 309 return QuotRemPair(QuoPhi, RemPhi); 310 } 311 312 /// Creates a runtime check to test whether both the divisor and dividend fit 313 /// into BypassType. The check is inserted at the end of MainBB. True return 314 /// value means that the operands fit. Either of the operands may be NULL if it 315 /// doesn't need a runtime check. 316 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 317 assert((Op1 || Op2) && "Nothing to check"); 318 IRBuilder<> Builder(MainBB, MainBB->end()); 319 320 Value *OrV; 321 if (Op1 && Op2) 322 OrV = Builder.CreateOr(Op1, Op2); 323 else 324 OrV = Op1 ? Op1 : Op2; 325 326 // BitMask is inverted to check if the operands are 327 // larger than the bypass type 328 uint64_t BitMask = ~BypassType->getBitMask(); 329 Value *AndV = Builder.CreateAnd(OrV, BitMask); 330 331 // Compare operand values 332 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 333 return Builder.CreateICmpEQ(AndV, ZeroV); 334 } 335 336 /// Substitutes the div/rem instruction with code that checks the value of the 337 /// operands and uses a shorter-faster div/rem instruction when possible. 338 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 339 Value *Dividend = SlowDivOrRem->getOperand(0); 340 Value *Divisor = SlowDivOrRem->getOperand(1); 341 342 if (isa<ConstantInt>(Divisor)) { 343 // Keep division by a constant for DAGCombiner. 344 return None; 345 } 346 347 VisitedSetTy SetL; 348 ValueRange DividendRange = getValueRange(Dividend, SetL); 349 if (DividendRange == VALRNG_LIKELY_LONG) 350 return None; 351 352 VisitedSetTy SetR; 353 ValueRange DivisorRange = getValueRange(Divisor, SetR); 354 if (DivisorRange == VALRNG_LIKELY_LONG) 355 return None; 356 357 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 358 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 359 360 if (DividendShort && DivisorShort) { 361 // If both operands are known to be short then just replace the long 362 // division with a short one in-place. 363 364 IRBuilder<> Builder(SlowDivOrRem); 365 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 366 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 367 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 368 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 369 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 370 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 371 return QuotRemPair(ExtDiv, ExtRem); 372 } else if (DividendShort && !isSignedOp()) { 373 // If the division is unsigned and Dividend is known to be short, then 374 // either 375 // 1) Divisor is less or equal to Dividend, and the result can be computed 376 // with a short division. 377 // 2) Divisor is greater than Dividend. In this case, no division is needed 378 // at all: The quotient is 0 and the remainder is equal to Dividend. 379 // 380 // So instead of checking at runtime whether Divisor fits into BypassType, 381 // we emit a runtime check to differentiate between these two cases. This 382 // lets us entirely avoid a long div. 383 384 // Split the basic block before the div/rem. 385 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 386 // Remove the unconditional branch from MainBB to SuccessorBB. 387 MainBB->getInstList().back().eraseFromParent(); 388 QuotRemWithBB Long; 389 Long.BB = MainBB; 390 Long.Quotient = ConstantInt::get(getSlowType(), 0); 391 Long.Remainder = Dividend; 392 QuotRemWithBB Fast = createFastBB(SuccessorBB); 393 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 394 IRBuilder<> Builder(MainBB, MainBB->end()); 395 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 396 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 397 return Result; 398 } else { 399 // General case. Create both slow and fast div/rem pairs and choose one of 400 // them at runtime. 401 402 // Split the basic block before the div/rem. 403 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 404 // Remove the unconditional branch from MainBB to SuccessorBB. 405 MainBB->getInstList().back().eraseFromParent(); 406 QuotRemWithBB Fast = createFastBB(SuccessorBB); 407 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 408 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 409 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 410 DivisorShort ? nullptr : Divisor); 411 IRBuilder<> Builder(MainBB, MainBB->end()); 412 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 413 return Result; 414 } 415 } 416 417 /// This optimization identifies DIV/REM instructions in a BB that can be 418 /// profitably bypassed and carried out with a shorter, faster divide. 419 bool llvm::bypassSlowDivision(BasicBlock *BB, 420 const BypassWidthsTy &BypassWidths) { 421 DivCacheTy PerBBDivCache; 422 423 bool MadeChange = false; 424 Instruction* Next = &*BB->begin(); 425 while (Next != nullptr) { 426 // We may add instructions immediately after I, but we want to skip over 427 // them. 428 Instruction* I = Next; 429 Next = Next->getNextNode(); 430 431 FastDivInsertionTask Task(I, BypassWidths); 432 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 433 I->replaceAllUsesWith(Replacement); 434 I->eraseFromParent(); 435 MadeChange = true; 436 } 437 } 438 439 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 440 // create divrem machine instructions. Now erase any unused divs / rems so we 441 // don't leave extra instructions sitting around. 442 for (auto &KV : PerBBDivCache) 443 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 444 RecursivelyDeleteTriviallyDeadInstructions(V); 445 446 return MadeChange; 447 } 448