1 //===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file contains an optimization for div and rem on architectures that 11 // execute short instructions significantly faster than longer instructions. 12 // For example, on Intel Atom 32-bit divides are slow enough that during 13 // runtime it is profitable to check the value of the operands, and if they are 14 // positive and less than 256 use an unsigned 8-bit divide. 15 // 16 //===----------------------------------------------------------------------===// 17 18 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 19 #include "llvm/ADT/DenseMap.h" 20 #include "llvm/ADT/SmallPtrSet.h" 21 #include "llvm/Analysis/ValueTracking.h" 22 #include "llvm/IR/Function.h" 23 #include "llvm/IR/IRBuilder.h" 24 #include "llvm/IR/Instructions.h" 25 #include "llvm/Transforms/Utils/Local.h" 26 27 using namespace llvm; 28 29 #define DEBUG_TYPE "bypass-slow-division" 30 31 namespace { 32 struct DivOpInfo { 33 bool SignedOp; 34 Value *Dividend; 35 Value *Divisor; 36 37 DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor) 38 : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {} 39 }; 40 41 struct QuotRemPair { 42 Value *Quotient; 43 Value *Remainder; 44 45 QuotRemPair(Value *InQuotient, Value *InRemainder) 46 : Quotient(InQuotient), Remainder(InRemainder) {} 47 }; 48 49 /// A quotient and remainder, plus a BB from which they logically "originate". 50 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 51 /// corresponding predecessor. 52 struct QuotRemWithBB { 53 BasicBlock *BB = nullptr; 54 Value *Quotient = nullptr; 55 Value *Remainder = nullptr; 56 }; 57 } 58 59 namespace llvm { 60 template<> 61 struct DenseMapInfo<DivOpInfo> { 62 static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) { 63 return Val1.SignedOp == Val2.SignedOp && 64 Val1.Dividend == Val2.Dividend && 65 Val1.Divisor == Val2.Divisor; 66 } 67 68 static DivOpInfo getEmptyKey() { 69 return DivOpInfo(false, nullptr, nullptr); 70 } 71 72 static DivOpInfo getTombstoneKey() { 73 return DivOpInfo(true, nullptr, nullptr); 74 } 75 76 static unsigned getHashValue(const DivOpInfo &Val) { 77 return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^ 78 reinterpret_cast<uintptr_t>(Val.Divisor)) ^ 79 (unsigned)Val.SignedOp; 80 } 81 }; 82 83 typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy; 84 typedef DenseMap<unsigned, unsigned> BypassWidthsTy; 85 typedef SmallPtrSet<Instruction *, 4> VisitedSetTy; 86 } 87 88 namespace { 89 enum ValueRange { 90 /// Operand definitely fits into BypassType. No runtime checks are needed. 91 VALRNG_KNOWN_SHORT, 92 /// A runtime check is required, as value range is unknown. 93 VALRNG_UNKNOWN, 94 /// Operand is unlikely to fit into BypassType. The bypassing should be 95 /// disabled. 96 VALRNG_LIKELY_LONG 97 }; 98 99 class FastDivInsertionTask { 100 bool IsValidTask = false; 101 Instruction *SlowDivOrRem = nullptr; 102 IntegerType *BypassType = nullptr; 103 BasicBlock *MainBB = nullptr; 104 105 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 106 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 107 QuotRemWithBB createSlowBB(BasicBlock *Successor); 108 QuotRemWithBB createFastBB(BasicBlock *Successor); 109 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 110 BasicBlock *PhiBB); 111 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 112 Optional<QuotRemPair> insertFastDivAndRem(); 113 114 bool isSignedOp() { 115 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 116 SlowDivOrRem->getOpcode() == Instruction::SRem; 117 } 118 bool isDivisionOp() { 119 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 120 SlowDivOrRem->getOpcode() == Instruction::UDiv; 121 } 122 Type *getSlowType() { return SlowDivOrRem->getType(); } 123 124 public: 125 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 126 Value *getReplacement(DivCacheTy &Cache); 127 }; 128 } // anonymous namespace 129 130 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 131 const BypassWidthsTy &BypassWidths) { 132 switch (I->getOpcode()) { 133 case Instruction::UDiv: 134 case Instruction::SDiv: 135 case Instruction::URem: 136 case Instruction::SRem: 137 SlowDivOrRem = I; 138 break; 139 default: 140 // I is not a div/rem operation. 141 return; 142 } 143 144 // Skip division on vector types. Only optimize integer instructions. 145 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 146 if (!SlowType) 147 return; 148 149 // Skip if this bitwidth is not bypassed. 150 auto BI = BypassWidths.find(SlowType->getBitWidth()); 151 if (BI == BypassWidths.end()) 152 return; 153 154 // Get type for div/rem instruction with bypass bitwidth. 155 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 156 BypassType = BT; 157 158 // The original basic block. 159 MainBB = I->getParent(); 160 161 // The instruction is indeed a slow div or rem operation. 162 IsValidTask = true; 163 } 164 165 /// Reuses previously-computed dividend or remainder from the current BB if 166 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 167 /// perform the optimization and caches the resulting dividend and remainder. 168 /// If no replacement can be generated, nullptr is returned. 169 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 170 // First, make sure that the task is valid. 171 if (!IsValidTask) 172 return nullptr; 173 174 // Then, look for a value in Cache. 175 Value *Dividend = SlowDivOrRem->getOperand(0); 176 Value *Divisor = SlowDivOrRem->getOperand(1); 177 DivOpInfo Key(isSignedOp(), Dividend, Divisor); 178 auto CacheI = Cache.find(Key); 179 180 if (CacheI == Cache.end()) { 181 // If previous instance does not exist, try to insert fast div. 182 Optional<QuotRemPair> OptResult = insertFastDivAndRem(); 183 // Bail out if insertFastDivAndRem has failed. 184 if (!OptResult) 185 return nullptr; 186 CacheI = Cache.insert({Key, *OptResult}).first; 187 } 188 189 QuotRemPair &Value = CacheI->second; 190 return isDivisionOp() ? Value.Quotient : Value.Remainder; 191 } 192 193 /// \brief Check if a value looks like a hash. 194 /// 195 /// The routine is expected to detect values computed using the most common hash 196 /// algorithms. Typically, hash computations end with one of the following 197 /// instructions: 198 /// 199 /// 1) MUL with a constant wider than BypassType 200 /// 2) XOR instruction 201 /// 202 /// And even if we are wrong and the value is not a hash, it is still quite 203 /// unlikely that such values will fit into BypassType. 204 /// 205 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 206 /// It is implemented as a depth-first search for values that look neither long 207 /// nor hash-like. 208 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 209 Instruction *I = dyn_cast<Instruction>(V); 210 if (!I) 211 return false; 212 213 switch (I->getOpcode()) { 214 case Instruction::Xor: 215 return true; 216 case Instruction::Mul: { 217 // After Constant Hoisting pass, long constants may be represented as 218 // bitcast instructions. As a result, some constants may look like an 219 // instruction at first, and an additional check is necessary to find out if 220 // an operand is actually a constant. 221 Value *Op1 = I->getOperand(1); 222 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 223 if (!C && isa<BitCastInst>(Op1)) 224 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 225 return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); 226 } 227 case Instruction::PHI: { 228 // Stop IR traversal in case of a crazy input code. This limits recursion 229 // depth. 230 if (Visited.size() >= 16) 231 return false; 232 // Do not visit nodes that have been visited already. We return true because 233 // it means that we couldn't find any value that doesn't look hash-like. 234 if (Visited.find(I) != Visited.end()) 235 return true; 236 Visited.insert(I); 237 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 238 // Ignore undef values as they probably don't affect the division 239 // operands. 240 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 241 isa<UndefValue>(V); 242 }); 243 } 244 default: 245 return false; 246 } 247 } 248 249 /// Check if an integer value fits into our bypass type. 250 ValueRange FastDivInsertionTask::getValueRange(Value *V, 251 VisitedSetTy &Visited) { 252 unsigned ShortLen = BypassType->getBitWidth(); 253 unsigned LongLen = V->getType()->getIntegerBitWidth(); 254 255 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 256 unsigned HiBits = LongLen - ShortLen; 257 258 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 259 APInt Zeros(LongLen, 0), Ones(LongLen, 0); 260 261 computeKnownBits(V, Zeros, Ones, DL); 262 263 if (Zeros.countLeadingOnes() >= HiBits) 264 return VALRNG_KNOWN_SHORT; 265 266 if (Ones.countLeadingZeros() < HiBits) 267 return VALRNG_LIKELY_LONG; 268 269 // Long integer divisions are often used in hashtable implementations. It's 270 // not worth bypassing such divisions because hash values are extremely 271 // unlikely to have enough leading zeros. The call below tries to detect 272 // values that are unlikely to fit BypassType (including hashes). 273 if (isHashLikeValue(V, Visited)) 274 return VALRNG_LIKELY_LONG; 275 276 return VALRNG_UNKNOWN; 277 } 278 279 /// Add new basic block for slow div and rem operations and put it before 280 /// SuccessorBB. 281 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 282 QuotRemWithBB DivRemPair; 283 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 284 MainBB->getParent(), SuccessorBB); 285 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 286 287 Value *Dividend = SlowDivOrRem->getOperand(0); 288 Value *Divisor = SlowDivOrRem->getOperand(1); 289 290 if (isSignedOp()) { 291 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 292 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 293 } else { 294 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 295 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 296 } 297 298 Builder.CreateBr(SuccessorBB); 299 return DivRemPair; 300 } 301 302 /// Add new basic block for fast div and rem operations and put it before 303 /// SuccessorBB. 304 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 305 QuotRemWithBB DivRemPair; 306 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 307 MainBB->getParent(), SuccessorBB); 308 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 309 310 Value *Dividend = SlowDivOrRem->getOperand(0); 311 Value *Divisor = SlowDivOrRem->getOperand(1); 312 Value *ShortDivisorV = 313 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 314 Value *ShortDividendV = 315 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 316 317 // udiv/urem because this optimization only handles positive numbers. 318 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 319 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 320 DivRemPair.Quotient = 321 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 322 DivRemPair.Remainder = 323 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 324 Builder.CreateBr(SuccessorBB); 325 326 return DivRemPair; 327 } 328 329 /// Creates Phi nodes for result of Div and Rem. 330 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 331 QuotRemWithBB &RHS, 332 BasicBlock *PhiBB) { 333 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 334 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 335 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 336 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 337 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 338 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 339 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 340 return QuotRemPair(QuoPhi, RemPhi); 341 } 342 343 /// Creates a runtime check to test whether both the divisor and dividend fit 344 /// into BypassType. The check is inserted at the end of MainBB. True return 345 /// value means that the operands fit. Either of the operands may be NULL if it 346 /// doesn't need a runtime check. 347 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 348 assert((Op1 || Op2) && "Nothing to check"); 349 IRBuilder<> Builder(MainBB, MainBB->end()); 350 351 Value *OrV; 352 if (Op1 && Op2) 353 OrV = Builder.CreateOr(Op1, Op2); 354 else 355 OrV = Op1 ? Op1 : Op2; 356 357 // BitMask is inverted to check if the operands are 358 // larger than the bypass type 359 uint64_t BitMask = ~BypassType->getBitMask(); 360 Value *AndV = Builder.CreateAnd(OrV, BitMask); 361 362 // Compare operand values 363 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 364 return Builder.CreateICmpEQ(AndV, ZeroV); 365 } 366 367 /// Substitutes the div/rem instruction with code that checks the value of the 368 /// operands and uses a shorter-faster div/rem instruction when possible. 369 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 370 Value *Dividend = SlowDivOrRem->getOperand(0); 371 Value *Divisor = SlowDivOrRem->getOperand(1); 372 373 if (isa<ConstantInt>(Divisor)) { 374 // Keep division by a constant for DAGCombiner. 375 return None; 376 } 377 378 VisitedSetTy SetL; 379 ValueRange DividendRange = getValueRange(Dividend, SetL); 380 if (DividendRange == VALRNG_LIKELY_LONG) 381 return None; 382 383 VisitedSetTy SetR; 384 ValueRange DivisorRange = getValueRange(Divisor, SetR); 385 if (DivisorRange == VALRNG_LIKELY_LONG) 386 return None; 387 388 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 389 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 390 391 if (DividendShort && DivisorShort) { 392 // If both operands are known to be short then just replace the long 393 // division with a short one in-place. 394 395 IRBuilder<> Builder(SlowDivOrRem); 396 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 397 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 398 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 399 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 400 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 401 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 402 return QuotRemPair(ExtDiv, ExtRem); 403 } else if (DividendShort && !isSignedOp()) { 404 // If the division is unsigned and Dividend is known to be short, then 405 // either 406 // 1) Divisor is less or equal to Dividend, and the result can be computed 407 // with a short division. 408 // 2) Divisor is greater than Dividend. In this case, no division is needed 409 // at all: The quotient is 0 and the remainder is equal to Dividend. 410 // 411 // So instead of checking at runtime whether Divisor fits into BypassType, 412 // we emit a runtime check to differentiate between these two cases. This 413 // lets us entirely avoid a long div. 414 415 // Split the basic block before the div/rem. 416 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 417 // Remove the unconditional branch from MainBB to SuccessorBB. 418 MainBB->getInstList().back().eraseFromParent(); 419 QuotRemWithBB Long; 420 Long.BB = MainBB; 421 Long.Quotient = ConstantInt::get(getSlowType(), 0); 422 Long.Remainder = Dividend; 423 QuotRemWithBB Fast = createFastBB(SuccessorBB); 424 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 425 IRBuilder<> Builder(MainBB, MainBB->end()); 426 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 427 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 428 return Result; 429 } else { 430 // General case. Create both slow and fast div/rem pairs and choose one of 431 // them at runtime. 432 433 // Split the basic block before the div/rem. 434 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 435 // Remove the unconditional branch from MainBB to SuccessorBB. 436 MainBB->getInstList().back().eraseFromParent(); 437 QuotRemWithBB Fast = createFastBB(SuccessorBB); 438 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 439 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 440 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 441 DivisorShort ? nullptr : Divisor); 442 IRBuilder<> Builder(MainBB, MainBB->end()); 443 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 444 return Result; 445 } 446 } 447 448 /// This optimization identifies DIV/REM instructions in a BB that can be 449 /// profitably bypassed and carried out with a shorter, faster divide. 450 bool llvm::bypassSlowDivision(BasicBlock *BB, 451 const BypassWidthsTy &BypassWidths) { 452 DivCacheTy PerBBDivCache; 453 454 bool MadeChange = false; 455 Instruction* Next = &*BB->begin(); 456 while (Next != nullptr) { 457 // We may add instructions immediately after I, but we want to skip over 458 // them. 459 Instruction* I = Next; 460 Next = Next->getNextNode(); 461 462 FastDivInsertionTask Task(I, BypassWidths); 463 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 464 I->replaceAllUsesWith(Replacement); 465 I->eraseFromParent(); 466 MadeChange = true; 467 } 468 } 469 470 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 471 // create divrem machine instructions. Now erase any unused divs / rems so we 472 // don't leave extra instructions sitting around. 473 for (auto &KV : PerBBDivCache) 474 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 475 RecursivelyDeleteTriviallyDeadInstructions(V); 476 477 return MadeChange; 478 } 479