1 //===- BypassSlowDivision.cpp - Bypass slow division ----------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file contains an optimization for div and rem on architectures that 10 // execute short instructions significantly faster than longer instructions. 11 // For example, on Intel Atom 32-bit divides are slow enough that during 12 // runtime it is profitable to check the value of the operands, and if they are 13 // positive and less than 256 use an unsigned 8-bit divide. 14 // 15 //===----------------------------------------------------------------------===// 16 17 #include "llvm/Transforms/Utils/BypassSlowDivision.h" 18 #include "llvm/ADT/DenseMap.h" 19 #include "llvm/ADT/None.h" 20 #include "llvm/ADT/Optional.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/ADT/SmallPtrSet.h" 23 #include "llvm/Transforms/Utils/Local.h" 24 #include "llvm/Analysis/ValueTracking.h" 25 #include "llvm/IR/BasicBlock.h" 26 #include "llvm/IR/Constants.h" 27 #include "llvm/IR/DerivedTypes.h" 28 #include "llvm/IR/Function.h" 29 #include "llvm/IR/IRBuilder.h" 30 #include "llvm/IR/Instruction.h" 31 #include "llvm/IR/Instructions.h" 32 #include "llvm/IR/Module.h" 33 #include "llvm/IR/Type.h" 34 #include "llvm/IR/Value.h" 35 #include "llvm/Support/Casting.h" 36 #include "llvm/Support/KnownBits.h" 37 #include <cassert> 38 #include <cstdint> 39 40 using namespace llvm; 41 42 #define DEBUG_TYPE "bypass-slow-division" 43 44 namespace { 45 46 struct QuotRemPair { 47 Value *Quotient; 48 Value *Remainder; 49 50 QuotRemPair(Value *InQuotient, Value *InRemainder) 51 : Quotient(InQuotient), Remainder(InRemainder) {} 52 }; 53 54 /// A quotient and remainder, plus a BB from which they logically "originate". 55 /// If you use Quotient or Remainder in a Phi node, you should use BB as its 56 /// corresponding predecessor. 57 struct QuotRemWithBB { 58 BasicBlock *BB = nullptr; 59 Value *Quotient = nullptr; 60 Value *Remainder = nullptr; 61 }; 62 63 using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>; 64 using BypassWidthsTy = DenseMap<unsigned, unsigned>; 65 using VisitedSetTy = SmallPtrSet<Instruction *, 4>; 66 67 enum ValueRange { 68 /// Operand definitely fits into BypassType. No runtime checks are needed. 69 VALRNG_KNOWN_SHORT, 70 /// A runtime check is required, as value range is unknown. 71 VALRNG_UNKNOWN, 72 /// Operand is unlikely to fit into BypassType. The bypassing should be 73 /// disabled. 74 VALRNG_LIKELY_LONG 75 }; 76 77 class FastDivInsertionTask { 78 bool IsValidTask = false; 79 Instruction *SlowDivOrRem = nullptr; 80 IntegerType *BypassType = nullptr; 81 BasicBlock *MainBB = nullptr; 82 83 bool isHashLikeValue(Value *V, VisitedSetTy &Visited); 84 ValueRange getValueRange(Value *Op, VisitedSetTy &Visited); 85 QuotRemWithBB createSlowBB(BasicBlock *Successor); 86 QuotRemWithBB createFastBB(BasicBlock *Successor); 87 QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS, 88 BasicBlock *PhiBB); 89 Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2); 90 Optional<QuotRemPair> insertFastDivAndRem(); 91 92 bool isSignedOp() { 93 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 94 SlowDivOrRem->getOpcode() == Instruction::SRem; 95 } 96 97 bool isDivisionOp() { 98 return SlowDivOrRem->getOpcode() == Instruction::SDiv || 99 SlowDivOrRem->getOpcode() == Instruction::UDiv; 100 } 101 102 Type *getSlowType() { return SlowDivOrRem->getType(); } 103 104 public: 105 FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths); 106 107 Value *getReplacement(DivCacheTy &Cache); 108 }; 109 110 } // end anonymous namespace 111 112 FastDivInsertionTask::FastDivInsertionTask(Instruction *I, 113 const BypassWidthsTy &BypassWidths) { 114 switch (I->getOpcode()) { 115 case Instruction::UDiv: 116 case Instruction::SDiv: 117 case Instruction::URem: 118 case Instruction::SRem: 119 SlowDivOrRem = I; 120 break; 121 default: 122 // I is not a div/rem operation. 123 return; 124 } 125 126 // Skip division on vector types. Only optimize integer instructions. 127 IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType()); 128 if (!SlowType) 129 return; 130 131 // Skip if this bitwidth is not bypassed. 132 auto BI = BypassWidths.find(SlowType->getBitWidth()); 133 if (BI == BypassWidths.end()) 134 return; 135 136 // Get type for div/rem instruction with bypass bitwidth. 137 IntegerType *BT = IntegerType::get(I->getContext(), BI->second); 138 BypassType = BT; 139 140 // The original basic block. 141 MainBB = I->getParent(); 142 143 // The instruction is indeed a slow div or rem operation. 144 IsValidTask = true; 145 } 146 147 /// Reuses previously-computed dividend or remainder from the current BB if 148 /// operands and operation are identical. Otherwise calls insertFastDivAndRem to 149 /// perform the optimization and caches the resulting dividend and remainder. 150 /// If no replacement can be generated, nullptr is returned. 151 Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) { 152 // First, make sure that the task is valid. 153 if (!IsValidTask) 154 return nullptr; 155 156 // Then, look for a value in Cache. 157 Value *Dividend = SlowDivOrRem->getOperand(0); 158 Value *Divisor = SlowDivOrRem->getOperand(1); 159 DivRemMapKey Key(isSignedOp(), Dividend, Divisor); 160 auto CacheI = Cache.find(Key); 161 162 if (CacheI == Cache.end()) { 163 // If previous instance does not exist, try to insert fast div. 164 Optional<QuotRemPair> OptResult = insertFastDivAndRem(); 165 // Bail out if insertFastDivAndRem has failed. 166 if (!OptResult) 167 return nullptr; 168 CacheI = Cache.insert({Key, *OptResult}).first; 169 } 170 171 QuotRemPair &Value = CacheI->second; 172 return isDivisionOp() ? Value.Quotient : Value.Remainder; 173 } 174 175 /// Check if a value looks like a hash. 176 /// 177 /// The routine is expected to detect values computed using the most common hash 178 /// algorithms. Typically, hash computations end with one of the following 179 /// instructions: 180 /// 181 /// 1) MUL with a constant wider than BypassType 182 /// 2) XOR instruction 183 /// 184 /// And even if we are wrong and the value is not a hash, it is still quite 185 /// unlikely that such values will fit into BypassType. 186 /// 187 /// To detect string hash algorithms like FNV we have to look through PHI-nodes. 188 /// It is implemented as a depth-first search for values that look neither long 189 /// nor hash-like. 190 bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) { 191 Instruction *I = dyn_cast<Instruction>(V); 192 if (!I) 193 return false; 194 195 switch (I->getOpcode()) { 196 case Instruction::Xor: 197 return true; 198 case Instruction::Mul: { 199 // After Constant Hoisting pass, long constants may be represented as 200 // bitcast instructions. As a result, some constants may look like an 201 // instruction at first, and an additional check is necessary to find out if 202 // an operand is actually a constant. 203 Value *Op1 = I->getOperand(1); 204 ConstantInt *C = dyn_cast<ConstantInt>(Op1); 205 if (!C && isa<BitCastInst>(Op1)) 206 C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0)); 207 return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth(); 208 } 209 case Instruction::PHI: 210 // Stop IR traversal in case of a crazy input code. This limits recursion 211 // depth. 212 if (Visited.size() >= 16) 213 return false; 214 // Do not visit nodes that have been visited already. We return true because 215 // it means that we couldn't find any value that doesn't look hash-like. 216 if (!Visited.insert(I).second) 217 return true; 218 return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) { 219 // Ignore undef values as they probably don't affect the division 220 // operands. 221 return getValueRange(V, Visited) == VALRNG_LIKELY_LONG || 222 isa<UndefValue>(V); 223 }); 224 default: 225 return false; 226 } 227 } 228 229 /// Check if an integer value fits into our bypass type. 230 ValueRange FastDivInsertionTask::getValueRange(Value *V, 231 VisitedSetTy &Visited) { 232 unsigned ShortLen = BypassType->getBitWidth(); 233 unsigned LongLen = V->getType()->getIntegerBitWidth(); 234 235 assert(LongLen > ShortLen && "Value type must be wider than BypassType"); 236 unsigned HiBits = LongLen - ShortLen; 237 238 const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout(); 239 KnownBits Known(LongLen); 240 241 computeKnownBits(V, Known, DL); 242 243 if (Known.countMinLeadingZeros() >= HiBits) 244 return VALRNG_KNOWN_SHORT; 245 246 if (Known.countMaxLeadingZeros() < HiBits) 247 return VALRNG_LIKELY_LONG; 248 249 // Long integer divisions are often used in hashtable implementations. It's 250 // not worth bypassing such divisions because hash values are extremely 251 // unlikely to have enough leading zeros. The call below tries to detect 252 // values that are unlikely to fit BypassType (including hashes). 253 if (isHashLikeValue(V, Visited)) 254 return VALRNG_LIKELY_LONG; 255 256 return VALRNG_UNKNOWN; 257 } 258 259 /// Add new basic block for slow div and rem operations and put it before 260 /// SuccessorBB. 261 QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) { 262 QuotRemWithBB DivRemPair; 263 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 264 MainBB->getParent(), SuccessorBB); 265 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 266 267 Value *Dividend = SlowDivOrRem->getOperand(0); 268 Value *Divisor = SlowDivOrRem->getOperand(1); 269 270 if (isSignedOp()) { 271 DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor); 272 DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor); 273 } else { 274 DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor); 275 DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor); 276 } 277 278 Builder.CreateBr(SuccessorBB); 279 return DivRemPair; 280 } 281 282 /// Add new basic block for fast div and rem operations and put it before 283 /// SuccessorBB. 284 QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) { 285 QuotRemWithBB DivRemPair; 286 DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "", 287 MainBB->getParent(), SuccessorBB); 288 IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin()); 289 290 Value *Dividend = SlowDivOrRem->getOperand(0); 291 Value *Divisor = SlowDivOrRem->getOperand(1); 292 Value *ShortDivisorV = 293 Builder.CreateCast(Instruction::Trunc, Divisor, BypassType); 294 Value *ShortDividendV = 295 Builder.CreateCast(Instruction::Trunc, Dividend, BypassType); 296 297 // udiv/urem because this optimization only handles positive numbers. 298 Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV); 299 Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV); 300 DivRemPair.Quotient = 301 Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType()); 302 DivRemPair.Remainder = 303 Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType()); 304 Builder.CreateBr(SuccessorBB); 305 306 return DivRemPair; 307 } 308 309 /// Creates Phi nodes for result of Div and Rem. 310 QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS, 311 QuotRemWithBB &RHS, 312 BasicBlock *PhiBB) { 313 IRBuilder<> Builder(PhiBB, PhiBB->begin()); 314 PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2); 315 QuoPhi->addIncoming(LHS.Quotient, LHS.BB); 316 QuoPhi->addIncoming(RHS.Quotient, RHS.BB); 317 PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2); 318 RemPhi->addIncoming(LHS.Remainder, LHS.BB); 319 RemPhi->addIncoming(RHS.Remainder, RHS.BB); 320 return QuotRemPair(QuoPhi, RemPhi); 321 } 322 323 /// Creates a runtime check to test whether both the divisor and dividend fit 324 /// into BypassType. The check is inserted at the end of MainBB. True return 325 /// value means that the operands fit. Either of the operands may be NULL if it 326 /// doesn't need a runtime check. 327 Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { 328 assert((Op1 || Op2) && "Nothing to check"); 329 IRBuilder<> Builder(MainBB, MainBB->end()); 330 331 Value *OrV; 332 if (Op1 && Op2) 333 OrV = Builder.CreateOr(Op1, Op2); 334 else 335 OrV = Op1 ? Op1 : Op2; 336 337 // BitMask is inverted to check if the operands are 338 // larger than the bypass type 339 uint64_t BitMask = ~BypassType->getBitMask(); 340 Value *AndV = Builder.CreateAnd(OrV, BitMask); 341 342 // Compare operand values 343 Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); 344 return Builder.CreateICmpEQ(AndV, ZeroV); 345 } 346 347 /// Substitutes the div/rem instruction with code that checks the value of the 348 /// operands and uses a shorter-faster div/rem instruction when possible. 349 Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() { 350 Value *Dividend = SlowDivOrRem->getOperand(0); 351 Value *Divisor = SlowDivOrRem->getOperand(1); 352 353 VisitedSetTy SetL; 354 ValueRange DividendRange = getValueRange(Dividend, SetL); 355 if (DividendRange == VALRNG_LIKELY_LONG) 356 return None; 357 358 VisitedSetTy SetR; 359 ValueRange DivisorRange = getValueRange(Divisor, SetR); 360 if (DivisorRange == VALRNG_LIKELY_LONG) 361 return None; 362 363 bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT); 364 bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT); 365 366 if (DividendShort && DivisorShort) { 367 // If both operands are known to be short then just replace the long 368 // division with a short one in-place. Since we're not introducing control 369 // flow in this case, narrowing the division is always a win, even if the 370 // divisor is a constant (and will later get replaced by a multiplication). 371 372 IRBuilder<> Builder(SlowDivOrRem); 373 Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType); 374 Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType); 375 Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor); 376 Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor); 377 Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType()); 378 Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType()); 379 return QuotRemPair(ExtDiv, ExtRem); 380 } 381 382 if (isa<ConstantInt>(Divisor)) { 383 // If the divisor is not a constant, DAGCombiner will convert it to a 384 // multiplication by a magic constant. It isn't clear if it is worth 385 // introducing control flow to get a narrower multiply. 386 return None; 387 } 388 389 // After Constant Hoisting pass, long constants may be represented as 390 // bitcast instructions. As a result, some constants may look like an 391 // instruction at first, and an additional check is necessary to find out if 392 // an operand is actually a constant. 393 if (auto *BCI = dyn_cast<BitCastInst>(Divisor)) 394 if (BCI->getParent() == SlowDivOrRem->getParent() && 395 isa<ConstantInt>(BCI->getOperand(0))) 396 return None; 397 398 if (DividendShort && !isSignedOp()) { 399 // If the division is unsigned and Dividend is known to be short, then 400 // either 401 // 1) Divisor is less or equal to Dividend, and the result can be computed 402 // with a short division. 403 // 2) Divisor is greater than Dividend. In this case, no division is needed 404 // at all: The quotient is 0 and the remainder is equal to Dividend. 405 // 406 // So instead of checking at runtime whether Divisor fits into BypassType, 407 // we emit a runtime check to differentiate between these two cases. This 408 // lets us entirely avoid a long div. 409 410 // Split the basic block before the div/rem. 411 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 412 // Remove the unconditional branch from MainBB to SuccessorBB. 413 MainBB->getInstList().back().eraseFromParent(); 414 QuotRemWithBB Long; 415 Long.BB = MainBB; 416 Long.Quotient = ConstantInt::get(getSlowType(), 0); 417 Long.Remainder = Dividend; 418 QuotRemWithBB Fast = createFastBB(SuccessorBB); 419 QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB); 420 IRBuilder<> Builder(MainBB, MainBB->end()); 421 Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor); 422 Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB); 423 return Result; 424 } else { 425 // General case. Create both slow and fast div/rem pairs and choose one of 426 // them at runtime. 427 428 // Split the basic block before the div/rem. 429 BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem); 430 // Remove the unconditional branch from MainBB to SuccessorBB. 431 MainBB->getInstList().back().eraseFromParent(); 432 QuotRemWithBB Fast = createFastBB(SuccessorBB); 433 QuotRemWithBB Slow = createSlowBB(SuccessorBB); 434 QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB); 435 Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend, 436 DivisorShort ? nullptr : Divisor); 437 IRBuilder<> Builder(MainBB, MainBB->end()); 438 Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB); 439 return Result; 440 } 441 } 442 443 /// This optimization identifies DIV/REM instructions in a BB that can be 444 /// profitably bypassed and carried out with a shorter, faster divide. 445 bool llvm::bypassSlowDivision(BasicBlock *BB, 446 const BypassWidthsTy &BypassWidths) { 447 DivCacheTy PerBBDivCache; 448 449 bool MadeChange = false; 450 Instruction *Next = &*BB->begin(); 451 while (Next != nullptr) { 452 // We may add instructions immediately after I, but we want to skip over 453 // them. 454 Instruction *I = Next; 455 Next = Next->getNextNode(); 456 457 // Ignore dead code to save time and avoid bugs. 458 if (I->hasNUses(0)) 459 continue; 460 461 FastDivInsertionTask Task(I, BypassWidths); 462 if (Value *Replacement = Task.getReplacement(PerBBDivCache)) { 463 I->replaceAllUsesWith(Replacement); 464 I->eraseFromParent(); 465 MadeChange = true; 466 } 467 } 468 469 // Above we eagerly create divs and rems, as pairs, so that we can efficiently 470 // create divrem machine instructions. Now erase any unused divs / rems so we 471 // don't leave extra instructions sitting around. 472 for (auto &KV : PerBBDivCache) 473 for (Value *V : {KV.second.Quotient, KV.second.Remainder}) 474 RecursivelyDeleteTriviallyDeadInstructions(V); 475 476 return MadeChange; 477 } 478