//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom 32-bit divides are slow enough that during
// runtime it is profitable to check the value of the operands, and if they are
// positive and less than 256 use an unsigned 8-bit divide.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

#define DEBUG_TYPE "bypass-slow-division"

namespace {
/// Identifies one div/rem computation: its signedness plus both operands.
/// Used as the key of the per-basic-block cache (DivCacheTy) so that an
/// identical division appearing again in the same BB reuses the code that was
/// already emitted for it.
struct DivOpInfo {
  bool SignedOp;
  Value *Dividend;
  Value *Divisor;

  DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
      : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
};

/// The replacement values produced for one bypassed division. Quotient and
/// remainder are always created as a pair so the backend can later combine
/// them into a single divrem machine instruction; any unused half is erased
/// at the end of bypassSlowDivision.
struct QuotRemPair {
  Value *Quotient;
  Value *Remainder;

  QuotRemPair(Value *InQuotient, Value *InRemainder)
      : Quotient(InQuotient), Remainder(InRemainder) {}
};

/// A quotient and remainder, plus a BB from which they logically "originate".
/// If you use Quotient or Remainder in a Phi node, you should use BB as its
/// corresponding predecessor.
struct QuotRemWithBB {
  BasicBlock *BB = nullptr;
  Value *Quotient = nullptr;
  Value *Remainder = nullptr;
};
}

namespace llvm {
/// DenseMap traits allowing DivOpInfo to key the per-BB division cache.
template<>
struct DenseMapInfo<DivOpInfo> {
  static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
    return Val1.SignedOp == Val2.SignedOp &&
           Val1.Dividend == Val2.Dividend &&
           Val1.Divisor == Val2.Divisor;
  }

  // The empty and tombstone sentinels must differ from each other and from
  // every real key. Real keys never have null operands, and the SignedOp flag
  // distinguishes the two sentinels from one another.
  static DivOpInfo getEmptyKey() {
    return DivOpInfo(false, nullptr, nullptr);
  }

  static DivOpInfo getTombstoneKey() {
    return DivOpInfo(true, nullptr, nullptr);
  }

  static unsigned getHashValue(const DivOpInfo &Val) {
    // A simple pointer-xor hash; collisions only cost extra isEqual calls.
    return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
                      reinterpret_cast<uintptr_t>(Val.Divisor)) ^
           (unsigned)Val.SignedOp;
  }
};

// Per-BB cache of already-bypassed divisions.
typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
// Maps a slow bit width to the narrower width to bypass to (e.g. 32 -> 8).
typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
// Visited set bounding the PHI traversal in isHashLikeValue.
typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
}

namespace {
enum ValueRange {
  /// Operand definitely fits into BypassType. No runtime checks are needed.
  VALRNG_KNOWN_SHORT,
  /// A runtime check is required, as value range is unknown.
  VALRNG_UNKNOWN,
  /// Operand is unlikely to fit into BypassType. The bypassing should be
  /// disabled.
  VALRNG_LIKELY_LONG
};

/// Encapsulates the bypass transformation for one candidate div/rem
/// instruction. Construction validates the candidate; getReplacement performs
/// (or reuses) the rewrite and returns the replacement value.
class FastDivInsertionTask {
  bool IsValidTask = false;
  Instruction *SlowDivOrRem = nullptr;
  IntegerType *BypassType = nullptr;
  BasicBlock *MainBB = nullptr;

  bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
  ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
  QuotRemWithBB createSlowBB(BasicBlock *Successor);
  QuotRemWithBB createFastBB(BasicBlock *Successor);
  QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
                                   BasicBlock *PhiBB);
  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
  Optional<QuotRemPair> insertFastDivAndRem();

  bool isSignedOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::SRem;
  }
  bool isDivisionOp() {
    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
           SlowDivOrRem->getOpcode() == Instruction::UDiv;
  }
  Type *getSlowType() { return SlowDivOrRem->getType(); }

public:
  FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
  Value *getReplacement(DivCacheTy &Cache);
};
} // anonymous namespace

/// Validates I as a bypass candidate. On any rejection the task is simply
/// left invalid (IsValidTask == false) and getReplacement becomes a no-op.
FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
                                           const BypassWidthsTy &BypassWidths) {
  switch (I->getOpcode()) {
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    SlowDivOrRem = I;
    break;
  default:
    // I is not a div/rem operation.
    return;
  }

  // Skip division on vector types. Only optimize integer instructions.
  IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
  if (!SlowType)
    return;

  // Skip if this bitwidth is not bypassed.
  auto BI = BypassWidths.find(SlowType->getBitWidth());
  if (BI == BypassWidths.end())
    return;

  // Get type for div/rem instruction with bypass bitwidth.
  IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
  BypassType = BT;

  // The original basic block.
  MainBB = I->getParent();

  // The instruction is indeed a slow div or rem operation.
  IsValidTask = true;
}

/// Reuses previously-computed dividend or remainder from the current BB if
/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
/// perform the optimization and caches the resulting dividend and remainder.
/// If no replacement can be generated, nullptr is returned.
Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
  // First, make sure that the task is valid.
  if (!IsValidTask)
    return nullptr;

  // Then, look for a value in Cache.
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);
  DivOpInfo Key(isSignedOp(), Dividend, Divisor);
  auto CacheI = Cache.find(Key);

  if (CacheI == Cache.end()) {
    // If previous instance does not exist, try to insert fast div.
    Optional<QuotRemPair> OptResult = insertFastDivAndRem();
    // Bail out if insertFastDivAndRem has failed.
    if (!OptResult)
      return nullptr;
    CacheI = Cache.insert({Key, *OptResult}).first;
  }

  // A div instruction takes the Quotient half of the cached pair; a rem
  // instruction takes the Remainder half.
  QuotRemPair &Value = CacheI->second;
  return isDivisionOp() ? Value.Quotient : Value.Remainder;
}

/// \brief Check if a value looks like a hash.
///
/// The routine is expected to detect values computed using the most common hash
/// algorithms. Typically, hash computations end with one of the following
/// instructions:
///
/// 1) MUL with a constant wider than BypassType
/// 2) XOR instruction
///
/// And even if we are wrong and the value is not a hash, it is still quite
/// unlikely that such values will fit into BypassType.
///
/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
207 /// It is implemented as a depth-first search for values that look neither long 208 /// nor hash-like. 209 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 210 Instruction *I = dyn_cast<Instruction>(V); 211 if (!I) 212 return false; 213 214 switch (I->getOpcode()) { 215 case Instruction::Xor: 216 return true; 217 case Instruction::Mul: { 218 // After Constant Hoisting pass, long constants may be represented as 219 // bitcast instructions. As a result, some constants may look like an 220 // instruction at first, and an additional check is necessary to find out if 221 // an operand is actually a constant. 222 Value *Op1 = I->getOperand(1); 223 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 224 if (!C && isa<BitCastInst>(Op1)) 225 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 226 return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); 227 } 228 case Instruction::PHI: { 229 // Stop IR traversal in case of a crazy input code. This limits recursion 230 // depth. 231 if (Visited.size() >= 16) 232 return false; 233 // Do not visit nodes that have been visited already. We return true because 234 // it means that we couldn't find any value that doesn't look hash-like. 235 if (Visited.find(I) != Visited.end()) 236 return true; 237 Visited.insert(I); 238 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 239 // Ignore undef values as they probably don't affect the division 240 // operands. 241 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 242 isa<UndefValue>(V); 243 }); 244 } 245 default: 246 return false; 247 } 248 } 249 250 /// Check if an integer value fits into our bypass type. 
251 ValueRange FastDivInsertionTask::getValueRange(Value *V, 252 VisitedSetTy &Visited) { 253 unsigned ShortLen = BypassType->getBitWidth(); 254 unsigned LongLen = V->getType()->getIntegerBitWidth(); 255 256 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 257 unsigned HiBits = LongLen - ShortLen; 258 259 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 260 KnownBits Known(LongLen); 261 262 computeKnownBits(V, Known, DL); 263 264 if (Known.countMinLeadingZeros() >= HiBits) 265 return VALRNG_KNOWN_SHORT; 266 267 if (Known.countMaxLeadingZeros() < HiBits) 268 return VALRNG_LIKELY_LONG; 269 270 // Long integer divisions are often used in hashtable implementations. It's 271 // not worth bypassing such divisions because hash values are extremely 272 // unlikely to have enough leading zeros. The call below tries to detect 273 // values that are unlikely to fit BypassType (including hashes). 274 if (isHashLikeValue(V, Visited)) 275 return VALRNG_LIKELY_LONG; 276 277 return VALRNG_UNKNOWN; 278 } 279 280 /// Add new basic block for slow div and rem operations and put it before 281 /// SuccessorBB. 
282 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 283 QuotRemWithBB DivRemPair; 284 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 285 MainBB->getParent(), SuccessorBB); 286 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 287 288 Value *Dividend = SlowDivOrRem->getOperand(0); 289 Value *Divisor = SlowDivOrRem->getOperand(1); 290 291 if (isSignedOp()) { 292 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 293 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 294 } else { 295 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 296 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 297 } 298 299 Builder.CreateBr(SuccessorBB); 300 return DivRemPair; 301 } 302 303 /// Add new basic block for fast div and rem operations and put it before 304 /// SuccessorBB. 305 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 306 QuotRemWithBB DivRemPair; 307 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 308 MainBB->getParent(), SuccessorBB); 309 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 310 311 Value *Dividend = SlowDivOrRem->getOperand(0); 312 Value *Divisor = SlowDivOrRem->getOperand(1); 313 Value *ShortDivisorV = 314 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 315 Value *ShortDividendV = 316 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 317 318 // udiv/urem because this optimization only handles positive numbers. 319 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 320 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 321 DivRemPair.Quotient = 322 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 323 DivRemPair.Remainder = 324 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 325 Builder.CreateBr(SuccessorBB); 326 327 return DivRemPair; 328 } 329 330 /// Creates Phi nodes for result of Div and Rem. 
QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
                                                       QuotRemWithBB &RHS,
                                                       BasicBlock *PhiBB) {
  IRBuilder<> Builder(PhiBB, PhiBB->begin());
  // Merge the quotient and remainder computed in the two predecessor blocks.
  // Each incoming value is paired with the BB it logically originates from
  // (see the QuotRemWithBB contract).
  PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
  QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
  QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
  PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
  RemPhi->addIncoming(LHS.Remainder, LHS.BB);
  RemPhi->addIncoming(RHS.Remainder, RHS.BB);
  return QuotRemPair(QuoPhi, RemPhi);
}

/// Creates a runtime check to test whether both the divisor and dividend fit
/// into BypassType. The check is inserted at the end of MainBB. True return
/// value means that the operands fit. Either of the operands may be NULL if it
/// doesn't need a runtime check.
Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
  assert((Op1 || Op2) && "Nothing to check");
  IRBuilder<> Builder(MainBB, MainBB->end());

  // OR-ing the operands lets one mask test cover both at once: the high bits
  // of the OR are zero iff the high bits of both operands are zero.
  Value *OrV;
  if (Op1 && Op2)
    OrV = Builder.CreateOr(Op1, Op2);
  else
    OrV = Op1 ? Op1 : Op2;

  // BitMask is inverted to check if the operands are
  // larger than the bypass type
  uint64_t BitMask = ~BypassType->getBitMask();
  Value *AndV = Builder.CreateAnd(OrV, BitMask);

  // Compare operand values
  Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
  return Builder.CreateICmpEQ(AndV, ZeroV);
}

/// Substitutes the div/rem instruction with code that checks the value of the
/// operands and uses a shorter-faster div/rem instruction when possible.
Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
  Value *Dividend = SlowDivOrRem->getOperand(0);
  Value *Divisor = SlowDivOrRem->getOperand(1);

  if (isa<ConstantInt>(Divisor)) {
    // Keep division by a constant for DAGCombiner.
    return None;
  }

  // Separate visited sets per operand: the 16-node traversal budget in
  // isHashLikeValue applies to each operand independently.
  VisitedSetTy SetL;
  ValueRange DividendRange = getValueRange(Dividend, SetL);
  if (DividendRange == VALRNG_LIKELY_LONG)
    return None;

  VisitedSetTy SetR;
  ValueRange DivisorRange = getValueRange(Divisor, SetR);
  if (DivisorRange == VALRNG_LIKELY_LONG)
    return None;

  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);

  if (DividendShort && DivisorShort) {
    // If both operands are known to be short then just replace the long
    // division with a short one in-place.  Unsigned ops are safe even for a
    // signed division: VALRNG_KNOWN_SHORT means the high bits (including the
    // sign bit) are known zero, so both operands are non-negative.

    IRBuilder<> Builder(SlowDivOrRem);
    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
    return QuotRemPair(ExtDiv, ExtRem);
  } else if (DividendShort && !isSignedOp()) {
    // If the division is unsigned and Dividend is known to be short, then
    // either
    // 1) Divisor is less or equal to Dividend, and the result can be computed
    //    with a short division.
    // 2) Divisor is greater than Dividend. In this case, no division is needed
    //    at all: The quotient is 0 and the remainder is equal to Dividend.
    //
    // So instead of checking at runtime whether Divisor fits into BypassType,
    // we emit a runtime check to differentiate between these two cases. This
    // lets us entirely avoid a long div.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->getInstList().back().eraseFromParent();
    // The "long" alternative is constant-folded: quotient 0, remainder is the
    // dividend itself. Its values logically originate in MainBB, which is the
    // phi predecessor taken when the CondBr below falls through.
    QuotRemWithBB Long;
    Long.BB = MainBB;
    Long.Quotient = ConstantInt::get(getSlowType(), 0);
    Long.Remainder = Dividend;
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
    IRBuilder<> Builder(MainBB, MainBB->end());
    // Dividend >= Divisor (unsigned): take the fast short-division block;
    // otherwise branch straight to SuccessorBB with the constant results.
    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
    return Result;
  } else {
    // General case. Create both slow and fast div/rem pairs and choose one of
    // them at runtime.

    // Split the basic block before the div/rem.
    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
    // Remove the unconditional branch from MainBB to SuccessorBB.
    MainBB->getInstList().back().eraseFromParent();
    QuotRemWithBB Fast = createFastBB(SuccessorBB);
    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
    // Only the operands whose range is still unknown need the runtime mask
    // test; a known-short operand passes nullptr and is skipped.
    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
                                            DivisorShort ? nullptr : Divisor);
    IRBuilder<> Builder(MainBB, MainBB->end());
    // True means both operands fit into BypassType -> take the fast block.
    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
    return Result;
  }
}

/// This optimization identifies DIV/REM instructions in a BB that can be
/// profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(BasicBlock *BB,
                              const BypassWidthsTy &BypassWidths) {
  DivCacheTy PerBBDivCache;

  bool MadeChange = false;
  Instruction* Next = &*BB->begin();
  while (Next != nullptr) {
    // We may add instructions immediately after I, but we want to skip over
    // them.  Advancing Next before transforming I keeps the iteration stable
    // even though I may be erased below.
    Instruction* I = Next;
    Next = Next->getNextNode();

    FastDivInsertionTask Task(I, BypassWidths);
    if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
      I->replaceAllUsesWith(Replacement);
      I->eraseFromParent();
      MadeChange = true;
    }
  }

  // Above we eagerly create divs and rems, as pairs, so that we can efficiently
  // create divrem machine instructions. Now erase any unused divs / rems so we
  // don't leave extra instructions sitting around.
  for (auto &KV : PerBBDivCache)
    for (Value *V : {KV.second.Quotient, KV.second.Remainder})
      RecursivelyDeleteTriviallyDeadInstructions(V);

  return MadeChange;
}