1 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// This pass does misc. AMDGPU optimizations on IR before instruction 12 /// selection. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "AMDGPU.h" 17 #include "AMDGPUSubtarget.h" 18 #include "AMDGPUTargetMachine.h" 19 #include "llvm/ADT/StringRef.h" 20 #include "llvm/Analysis/AssumptionCache.h" 21 #include "llvm/Analysis/LegacyDivergenceAnalysis.h" 22 #include "llvm/Analysis/Loads.h" 23 #include "llvm/Analysis/ValueTracking.h" 24 #include "llvm/CodeGen/Passes.h" 25 #include "llvm/CodeGen/TargetPassConfig.h" 26 #include "llvm/IR/Attributes.h" 27 #include "llvm/IR/BasicBlock.h" 28 #include "llvm/IR/Constants.h" 29 #include "llvm/IR/DerivedTypes.h" 30 #include "llvm/IR/Function.h" 31 #include "llvm/IR/IRBuilder.h" 32 #include "llvm/IR/InstVisitor.h" 33 #include "llvm/IR/InstrTypes.h" 34 #include "llvm/IR/Instruction.h" 35 #include "llvm/IR/Instructions.h" 36 #include "llvm/IR/IntrinsicInst.h" 37 #include "llvm/IR/Intrinsics.h" 38 #include "llvm/IR/LLVMContext.h" 39 #include "llvm/IR/Operator.h" 40 #include "llvm/IR/Type.h" 41 #include "llvm/IR/Value.h" 42 #include "llvm/Pass.h" 43 #include "llvm/Support/Casting.h" 44 #include <cassert> 45 #include <iterator> 46 47 #define DEBUG_TYPE "amdgpu-codegenprepare" 48 49 using namespace llvm; 50 51 namespace { 52 53 static cl::opt<bool> WidenLoads( 54 "amdgpu-codegenprepare-widen-constant-loads", 55 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"), 56 cl::ReallyHidden, 57 cl::init(true)); 58 59 class AMDGPUCodeGenPrepare : public FunctionPass, 60 public InstVisitor<AMDGPUCodeGenPrepare, bool> { 61 const GCNSubtarget *ST = nullptr; 62 AssumptionCache *AC = nullptr; 63 LegacyDivergenceAnalysis *DA = nullptr; 64 Module *Mod = nullptr; 65 bool HasUnsafeFPMath = false; 66 67 /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to 68 /// binary operation \p V. 69 /// 70 /// \returns Binary operation \p V. 71 /// \returns \p T's base element bit width. 72 unsigned getBaseElementBitWidth(const Type *T) const; 73 74 /// \returns Equivalent 32 bit integer type for given type \p T. For example, 75 /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32> 76 /// is returned. 77 Type *getI32Ty(IRBuilder<> &B, const Type *T) const; 78 79 /// \returns True if binary operation \p I is a signed binary operation, false 80 /// otherwise. 81 bool isSigned(const BinaryOperator &I) const; 82 83 /// \returns True if the condition of 'select' operation \p I comes from a 84 /// signed 'icmp' operation, false otherwise. 85 bool isSigned(const SelectInst &I) const; 86 87 /// \returns True if type \p T needs to be promoted to 32 bit integer type, 88 /// false otherwise. 89 bool needsPromotionToI32(const Type *T) const; 90 91 /// Promotes uniform binary operation \p I to equivalent 32 bit binary 92 /// operation. 93 /// 94 /// \details \p I's base element bit width must be greater than 1 and less 95 /// than or equal 16. Promotion is done by sign or zero extending operands to 96 /// 32 bits, replacing \p I with equivalent 32 bit binary operation, and 97 /// truncating the result of 32 bit binary operation back to \p I's original 98 /// type. Division operation is not promoted. 99 /// 100 /// \returns True if \p I is promoted to equivalent 32 bit binary operation, 101 /// false otherwise. 102 bool promoteUniformOpToI32(BinaryOperator &I) const; 103 104 /// Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation. 105 /// 106 /// \details \p I's base element bit width must be greater than 1 and less 107 /// than or equal 16. Promotion is done by sign or zero extending operands to 108 /// 32 bits, and replacing \p I with 32 bit 'icmp' operation. 109 /// 110 /// \returns True. 111 bool promoteUniformOpToI32(ICmpInst &I) const; 112 113 /// Promotes uniform 'select' operation \p I to 32 bit 'select' 114 /// operation. 115 /// 116 /// \details \p I's base element bit width must be greater than 1 and less 117 /// than or equal 16. Promotion is done by sign or zero extending operands to 118 /// 32 bits, replacing \p I with 32 bit 'select' operation, and truncating the 119 /// result of 32 bit 'select' operation back to \p I's original type. 120 /// 121 /// \returns True. 122 bool promoteUniformOpToI32(SelectInst &I) const; 123 124 /// Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse' 125 /// intrinsic. 126 /// 127 /// \details \p I's base element bit width must be greater than 1 and less 128 /// than or equal 16. Promotion is done by zero extending the operand to 32 129 /// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the 130 /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the 131 /// shift amount is 32 minus \p I's base element bit width), and truncating 132 /// the result of the shift operation back to \p I's original type. 133 /// 134 /// \returns True. 135 bool promoteUniformBitreverseToI32(IntrinsicInst &I) const; 136 137 /// Expands 24 bit div or rem. 138 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, 139 Value *Num, Value *Den, 140 bool IsDiv, bool IsSigned) const; 141 142 /// Expands 32 bit div or rem. 143 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I, 144 Value *Num, Value *Den) const; 145 146 /// Widen a scalar load. 147 /// 148 /// \details \p Widen scalar load for uniform, small type loads from constant 149 // memory / to a full 32-bits and then truncate the input to allow a scalar 150 // load instead of a vector load. 151 // 152 /// \returns True. 153 154 bool canWidenScalarExtLoad(LoadInst &I) const; 155 156 public: 157 static char ID; 158 159 AMDGPUCodeGenPrepare() : FunctionPass(ID) {} 160 161 bool visitFDiv(BinaryOperator &I); 162 163 bool visitInstruction(Instruction &I) { return false; } 164 bool visitBinaryOperator(BinaryOperator &I); 165 bool visitLoadInst(LoadInst &I); 166 bool visitICmpInst(ICmpInst &I); 167 bool visitSelectInst(SelectInst &I); 168 169 bool visitIntrinsicInst(IntrinsicInst &I); 170 bool visitBitreverseIntrinsicInst(IntrinsicInst &I); 171 172 bool doInitialization(Module &M) override; 173 bool runOnFunction(Function &F) override; 174 175 StringRef getPassName() const override { return "AMDGPU IR optimizations"; } 176 177 void getAnalysisUsage(AnalysisUsage &AU) const override { 178 AU.addRequired<AssumptionCacheTracker>(); 179 AU.addRequired<LegacyDivergenceAnalysis>(); 180 AU.setPreservesAll(); 181 } 182 }; 183 184 } // end anonymous namespace 185 186 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const { 187 assert(needsPromotionToI32(T) && "T does not need promotion to i32"); 188 189 if (T->isIntegerTy()) 190 return T->getIntegerBitWidth(); 191 return cast<VectorType>(T)->getElementType()->getIntegerBitWidth(); 192 } 193 194 Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const { 195 assert(needsPromotionToI32(T) && "T does not need promotion to i32"); 196 197 if (T->isIntegerTy()) 198 return B.getInt32Ty(); 199 return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements()); 200 } 201 202 bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const { 203 return I.getOpcode() == Instruction::AShr || 204 I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem; 205 } 206 207 bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const { 208 return isa<ICmpInst>(I.getOperand(0)) ? 209 cast<ICmpInst>(I.getOperand(0))->isSigned() : false; 210 } 211 212 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const { 213 const IntegerType *IntTy = dyn_cast<IntegerType>(T); 214 if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16) 215 return true; 216 217 if (const VectorType *VT = dyn_cast<VectorType>(T)) { 218 // TODO: The set of packed operations is more limited, so may want to 219 // promote some anyway. 220 if (ST->hasVOP3PInsts()) 221 return false; 222 223 return needsPromotionToI32(VT->getElementType()); 224 } 225 226 return false; 227 } 228 229 // Return true if the op promoted to i32 should have nsw set. 230 static bool promotedOpIsNSW(const Instruction &I) { 231 switch (I.getOpcode()) { 232 case Instruction::Shl: 233 case Instruction::Add: 234 case Instruction::Sub: 235 return true; 236 case Instruction::Mul: 237 return I.hasNoUnsignedWrap(); 238 default: 239 return false; 240 } 241 } 242 243 // Return true if the op promoted to i32 should have nuw set. 244 static bool promotedOpIsNUW(const Instruction &I) { 245 switch (I.getOpcode()) { 246 case Instruction::Shl: 247 case Instruction::Add: 248 case Instruction::Mul: 249 return true; 250 case Instruction::Sub: 251 return I.hasNoUnsignedWrap(); 252 default: 253 return false; 254 } 255 } 256 257 bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const { 258 Type *Ty = I.getType(); 259 const DataLayout &DL = Mod->getDataLayout(); 260 int TySize = DL.getTypeSizeInBits(Ty); 261 unsigned Align = I.getAlignment() ? 262 I.getAlignment() : DL.getABITypeAlignment(Ty); 263 264 return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I); 265 } 266 267 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const { 268 assert(needsPromotionToI32(I.getType()) && 269 "I does not need promotion to i32"); 270 271 if (I.getOpcode() == Instruction::SDiv || 272 I.getOpcode() == Instruction::UDiv || 273 I.getOpcode() == Instruction::SRem || 274 I.getOpcode() == Instruction::URem) 275 return false; 276 277 IRBuilder<> Builder(&I); 278 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 279 280 Type *I32Ty = getI32Ty(Builder, I.getType()); 281 Value *ExtOp0 = nullptr; 282 Value *ExtOp1 = nullptr; 283 Value *ExtRes = nullptr; 284 Value *TruncRes = nullptr; 285 286 if (isSigned(I)) { 287 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 288 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 289 } else { 290 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 291 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 292 } 293 294 ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1); 295 if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) { 296 if (promotedOpIsNSW(cast<Instruction>(I))) 297 Inst->setHasNoSignedWrap(); 298 299 if (promotedOpIsNUW(cast<Instruction>(I))) 300 Inst->setHasNoUnsignedWrap(); 301 302 if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) 303 Inst->setIsExact(ExactOp->isExact()); 304 } 305 306 TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); 307 308 I.replaceAllUsesWith(TruncRes); 309 I.eraseFromParent(); 310 311 return true; 312 } 313 314 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const { 315 assert(needsPromotionToI32(I.getOperand(0)->getType()) && 316 "I does not need promotion to i32"); 317 318 IRBuilder<> Builder(&I); 319 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 320 321 Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType()); 322 Value *ExtOp0 = nullptr; 323 Value *ExtOp1 = nullptr; 324 Value *NewICmp = nullptr; 325 326 if (I.isSigned()) { 327 ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty); 328 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 329 } else { 330 ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty); 331 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 332 } 333 NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1); 334 335 I.replaceAllUsesWith(NewICmp); 336 I.eraseFromParent(); 337 338 return true; 339 } 340 341 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const { 342 assert(needsPromotionToI32(I.getType()) && 343 "I does not need promotion to i32"); 344 345 IRBuilder<> Builder(&I); 346 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 347 348 Type *I32Ty = getI32Ty(Builder, I.getType()); 349 Value *ExtOp1 = nullptr; 350 Value *ExtOp2 = nullptr; 351 Value *ExtRes = nullptr; 352 Value *TruncRes = nullptr; 353 354 if (isSigned(I)) { 355 ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty); 356 ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty); 357 } else { 358 ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty); 359 ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty); 360 } 361 ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2); 362 TruncRes = Builder.CreateTrunc(ExtRes, I.getType()); 363 364 I.replaceAllUsesWith(TruncRes); 365 I.eraseFromParent(); 366 367 return true; 368 } 369 370 bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32( 371 IntrinsicInst &I) const { 372 assert(I.getIntrinsicID() == Intrinsic::bitreverse && 373 "I must be bitreverse intrinsic"); 374 assert(needsPromotionToI32(I.getType()) && 375 "I does not need promotion to i32"); 376 377 IRBuilder<> Builder(&I); 378 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 379 380 Type *I32Ty = getI32Ty(Builder, I.getType()); 381 Function *I32 = 382 Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); 383 Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); 384 Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); 385 Value *LShrOp = 386 Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType())); 387 Value *TruncRes = 388 Builder.CreateTrunc(LShrOp, I.getType()); 389 390 I.replaceAllUsesWith(TruncRes); 391 I.eraseFromParent(); 392 393 return true; 394 } 395 396 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) { 397 const ConstantFP *CNum = dyn_cast<ConstantFP>(Num); 398 if (!CNum) 399 return HasDenormals; 400 401 if (UnsafeDiv) 402 return true; 403 404 bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0); 405 406 // Reciprocal f32 is handled separately without denormals. 407 return HasDenormals ^ IsOne; 408 } 409 410 // Insert an intrinsic for fast fdiv for safe math situations where we can 411 // reduce precision. Leave fdiv for situations where the generic node is 412 // expected to be optimized. 413 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { 414 Type *Ty = FDiv.getType(); 415 416 if (!Ty->getScalarType()->isFloatTy()) 417 return false; 418 419 MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath); 420 if (!FPMath) 421 return false; 422 423 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv); 424 float ULP = FPOp->getFPAccuracy(); 425 if (ULP < 2.5f) 426 return false; 427 428 FastMathFlags FMF = FPOp->getFastMathFlags(); 429 bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() || 430 FMF.allowReciprocal(); 431 432 // With UnsafeDiv node will be optimized to just rcp and mul. 433 if (UnsafeDiv) 434 return false; 435 436 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); 437 Builder.setFastMathFlags(FMF); 438 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc()); 439 440 Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast); 441 442 Value *Num = FDiv.getOperand(0); 443 Value *Den = FDiv.getOperand(1); 444 445 Value *NewFDiv = nullptr; 446 447 bool HasDenormals = ST->hasFP32Denormals(); 448 if (VectorType *VT = dyn_cast<VectorType>(Ty)) { 449 NewFDiv = UndefValue::get(VT); 450 451 // FIXME: Doesn't do the right thing for cases where the vector is partially 452 // constant. This works when the scalarizer pass is run first. 453 for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) { 454 Value *NumEltI = Builder.CreateExtractElement(Num, I); 455 Value *DenEltI = Builder.CreateExtractElement(Den, I); 456 Value *NewElt; 457 458 if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) { 459 NewElt = Builder.CreateFDiv(NumEltI, DenEltI); 460 } else { 461 NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }); 462 } 463 464 NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I); 465 } 466 } else { 467 if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals)) 468 NewFDiv = Builder.CreateCall(Decl, { Num, Den }); 469 } 470 471 if (NewFDiv) { 472 FDiv.replaceAllUsesWith(NewFDiv); 473 NewFDiv->takeName(&FDiv); 474 FDiv.eraseFromParent(); 475 } 476 477 return !!NewFDiv; 478 } 479 480 static bool hasUnsafeFPMath(const Function &F) { 481 Attribute Attr = F.getFnAttribute("unsafe-fp-math"); 482 return Attr.getValueAsString() == "true"; 483 } 484 485 static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder, 486 Value *LHS, Value *RHS) { 487 Type *I32Ty = Builder.getInt32Ty(); 488 Type *I64Ty = Builder.getInt64Ty(); 489 490 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty); 491 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty); 492 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64); 493 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty); 494 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32)); 495 Hi = Builder.CreateTrunc(Hi, I32Ty); 496 return std::make_pair(Lo, Hi); 497 } 498 499 static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) { 500 return getMul64(Builder, LHS, RHS).second; 501 } 502 503 // The fractional part of a float is enough to accurately represent up to 504 // a 24-bit signed integer. 505 Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder, 506 BinaryOperator &I, 507 Value *Num, Value *Den, 508 bool IsDiv, bool IsSigned) const { 509 assert(Num->getType()->isIntegerTy(32)); 510 511 const DataLayout &DL = Mod->getDataLayout(); 512 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I); 513 if (LHSSignBits < 9) 514 return nullptr; 515 516 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I); 517 if (RHSSignBits < 9) 518 return nullptr; 519 520 521 unsigned SignBits = std::min(LHSSignBits, RHSSignBits); 522 unsigned DivBits = 32 - SignBits; 523 if (IsSigned) 524 ++DivBits; 525 526 Type *Ty = Num->getType(); 527 Type *I32Ty = Builder.getInt32Ty(); 528 Type *F32Ty = Builder.getFloatTy(); 529 ConstantInt *One = Builder.getInt32(1); 530 Value *JQ = One; 531 532 if (IsSigned) { 533 // char|short jq = ia ^ ib; 534 JQ = Builder.CreateXor(Num, Den); 535 536 // jq = jq >> (bitsize - 2) 537 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30)); 538 539 // jq = jq | 0x1 540 JQ = Builder.CreateOr(JQ, One); 541 } 542 543 // int ia = (int)LHS; 544 Value *IA = Num; 545 546 // int ib, (int)RHS; 547 Value *IB = Den; 548 549 // float fa = (float)ia; 550 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty) 551 : Builder.CreateUIToFP(IA, F32Ty); 552 553 // float fb = (float)ib; 554 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) 555 : Builder.CreateUIToFP(IB,F32Ty); 556 557 Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB); 558 Value *FQM = Builder.CreateFMul(FA, RCP); 559 560 // fq = trunc(fqm); 561 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM); 562 FQ->copyFastMathFlags(Builder.getFastMathFlags()); 563 564 // float fqneg = -fq; 565 Value *FQNeg = Builder.CreateFNeg(FQ); 566 567 // float fr = mad(fqneg, fb, fa); 568 Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz, 569 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ); 570 571 // int iq = (int)fq; 572 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty) 573 : Builder.CreateFPToUI(FQ, I32Ty); 574 575 // fr = fabs(fr); 576 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ); 577 578 // fb = fabs(fb); 579 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ); 580 581 // int cv = fr >= fb; 582 Value *CV = Builder.CreateFCmpOGE(FR, FB); 583 584 // jq = (cv ? jq : 0); 585 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0)); 586 587 // dst = iq + jq; 588 Value *Div = Builder.CreateAdd(IQ, JQ); 589 590 Value *Res = Div; 591 if (!IsDiv) { 592 // Rem needs compensation, it's easier to recompute it 593 Value *Rem = Builder.CreateMul(Div, Den); 594 Res = Builder.CreateSub(Num, Rem); 595 } 596 597 // Truncate to number of bits this divide really is. 598 if (IsSigned) { 599 Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits)); 600 Res = Builder.CreateSExt(Res, Ty); 601 } else { 602 ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1); 603 Res = Builder.CreateAnd(Res, TruncMask); 604 } 605 606 return Res; 607 } 608 609 Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder, 610 BinaryOperator &I, 611 Value *Num, Value *Den) const { 612 Instruction::BinaryOps Opc = I.getOpcode(); 613 assert(Opc == Instruction::URem || Opc == Instruction::UDiv || 614 Opc == Instruction::SRem || Opc == Instruction::SDiv); 615 616 FastMathFlags FMF; 617 FMF.setFast(); 618 Builder.setFastMathFlags(FMF); 619 620 if (isa<Constant>(Den)) 621 return nullptr; // Keep it for optimization 622 623 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv; 624 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv; 625 626 Type *Ty = Num->getType(); 627 Type *I32Ty = Builder.getInt32Ty(); 628 Type *F32Ty = Builder.getFloatTy(); 629 630 if (Ty->getScalarSizeInBits() < 32) { 631 if (IsSigned) { 632 Num = Builder.CreateSExt(Num, I32Ty); 633 Den = Builder.CreateSExt(Den, I32Ty); 634 } else { 635 Num = Builder.CreateZExt(Num, I32Ty); 636 Den = Builder.CreateZExt(Den, I32Ty); 637 } 638 } 639 640 if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) { 641 Res = Builder.CreateTrunc(Res, Ty); 642 return Res; 643 } 644 645 ConstantInt *Zero = Builder.getInt32(0); 646 ConstantInt *One = Builder.getInt32(1); 647 ConstantInt *MinusOne = Builder.getInt32(~0); 648 649 Value *Sign = nullptr; 650 if (IsSigned) { 651 ConstantInt *K31 = Builder.getInt32(31); 652 Value *LHSign = Builder.CreateAShr(Num, K31); 653 Value *RHSign = Builder.CreateAShr(Den, K31); 654 // Remainder sign is the same as LHS 655 Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign; 656 657 Num = Builder.CreateAdd(Num, LHSign); 658 Den = Builder.CreateAdd(Den, RHSign); 659 660 Num = Builder.CreateXor(Num, LHSign); 661 Den = Builder.CreateXor(Den, RHSign); 662 } 663 664 // RCP = URECIP(Den) = 2^32 / Den + e 665 // e is rounding error. 666 Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty); 667 Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32); 668 Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000)); 669 Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1); 670 Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty); 671 672 // RCP_LO, RCP_HI = mul(RCP, Den) */ 673 Value *RCP_LO, *RCP_HI; 674 std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den); 675 676 // NEG_RCP_LO = -RCP_LO 677 Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO); 678 679 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) 680 Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero); 681 Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO); 682 683 // Calculate the rounding error from the URECIP instruction 684 // E = mulhu(ABS_RCP_LO, RCP) 685 Value *E = getMulHu(Builder, ABS_RCP_LO, RCP); 686 687 // RCP_A_E = RCP + E 688 Value *RCP_A_E = Builder.CreateAdd(RCP, E); 689 690 // RCP_S_E = RCP - E 691 Value *RCP_S_E = Builder.CreateSub(RCP, E); 692 693 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E) 694 Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E); 695 696 // Quotient = mulhu(Tmp0, Num) 697 Value *Quotient = getMulHu(Builder, Tmp0, Num); 698 699 // Num_S_Remainder = Quotient * Den 700 Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den); 701 702 // Remainder = Num - Num_S_Remainder 703 Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder); 704 705 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) 706 Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den); 707 Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero); 708 709 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0) 710 Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder); 711 Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, 712 MinusOne, Zero); 713 714 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero 715 Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero); 716 Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero); 717 718 Value *Res; 719 if (IsDiv) { 720 // Quotient_A_One = Quotient + 1 721 Value *Quotient_A_One = Builder.CreateAdd(Quotient, One); 722 723 // Quotient_S_One = Quotient - 1 724 Value *Quotient_S_One = Builder.CreateSub(Quotient, One); 725 726 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) 727 Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One); 728 729 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) 730 Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One); 731 } else { 732 // Remainder_S_Den = Remainder - Den 733 Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den); 734 735 // Remainder_A_Den = Remainder + Den 736 Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den); 737 738 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den) 739 Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den); 740 741 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) 742 Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den); 743 } 744 745 if (IsSigned) { 746 Res = Builder.CreateXor(Res, Sign); 747 Res = Builder.CreateSub(Res, Sign); 748 } 749 750 Res = Builder.CreateTrunc(Res, Ty); 751 752 return Res; 753 } 754 755 bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { 756 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 757 DA->isUniform(&I) && promoteUniformOpToI32(I)) 758 return true; 759 760 bool Changed = false; 761 Instruction::BinaryOps Opc = I.getOpcode(); 762 Type *Ty = I.getType(); 763 Value *NewDiv = nullptr; 764 if ((Opc == Instruction::URem || Opc == Instruction::UDiv || 765 Opc == Instruction::SRem || Opc == Instruction::SDiv) && 766 Ty->getScalarSizeInBits() <= 32) { 767 Value *Num = I.getOperand(0); 768 Value *Den = I.getOperand(1); 769 IRBuilder<> Builder(&I); 770 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 771 772 if (VectorType *VT = dyn_cast<VectorType>(Ty)) { 773 NewDiv = UndefValue::get(VT); 774 775 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) { 776 Value *NumEltN = Builder.CreateExtractElement(Num, N); 777 Value *DenEltN = Builder.CreateExtractElement(Den, N); 778 Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN); 779 if (!NewElt) 780 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN); 781 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N); 782 } 783 } else { 784 NewDiv = expandDivRem32(Builder, I, Num, Den); 785 } 786 787 if (NewDiv) { 788 I.replaceAllUsesWith(NewDiv); 789 I.eraseFromParent(); 790 Changed = true; 791 } 792 } 793 794 return Changed; 795 } 796 797 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { 798 if (!WidenLoads) 799 return false; 800 801 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || 802 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && 803 canWidenScalarExtLoad(I)) { 804 IRBuilder<> Builder(&I); 805 Builder.SetCurrentDebugLocation(I.getDebugLoc()); 806 807 Type *I32Ty = Builder.getInt32Ty(); 808 Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace()); 809 Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT); 810 LoadInst *WidenLoad = Builder.CreateLoad(BitCast); 811 WidenLoad->copyMetadata(I); 812 813 // If we have range metadata, we need to convert the type, and not make 814 // assumptions about the high bits. 815 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) { 816 ConstantInt *Lower = 817 mdconst::extract<ConstantInt>(Range->getOperand(0)); 818 819 if (Lower->getValue().isNullValue()) { 820 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr); 821 } else { 822 Metadata *LowAndHigh[] = { 823 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))), 824 // Don't make assumptions about the high bits. 825 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0)) 826 }; 827 828 WidenLoad->setMetadata(LLVMContext::MD_range, 829 MDNode::get(Mod->getContext(), LowAndHigh)); 830 } 831 } 832 833 int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType()); 834 Type *IntNTy = Builder.getIntNTy(TySize); 835 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); 836 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); 837 I.replaceAllUsesWith(ValOrig); 838 I.eraseFromParent(); 839 return true; 840 } 841 842 return false; 843 } 844 845 bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) { 846 bool Changed = false; 847 848 if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) && 849 DA->isUniform(&I)) 850 Changed |= promoteUniformOpToI32(I); 851 852 return Changed; 853 } 854 855 bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) { 856 bool Changed = false; 857 858 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 859 DA->isUniform(&I)) 860 Changed |= promoteUniformOpToI32(I); 861 862 return Changed; 863 } 864 865 bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) { 866 switch (I.getIntrinsicID()) { 867 case Intrinsic::bitreverse: 868 return visitBitreverseIntrinsicInst(I); 869 default: 870 return false; 871 } 872 } 873 874 bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) { 875 bool Changed = false; 876 877 if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) && 878 DA->isUniform(&I)) 879 Changed |= promoteUniformBitreverseToI32(I); 880 881 return Changed; 882 } 883 884 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) { 885 Mod = &M; 886 return false; 887 } 888 889 bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) { 890 if (skipFunction(F)) 891 return false; 892 893 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 894 if (!TPC) 895 return false; 896 897 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>(); 898 ST = &TM.getSubtarget<GCNSubtarget>(F); 899 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 900 DA = &getAnalysis<LegacyDivergenceAnalysis>(); 901 HasUnsafeFPMath = hasUnsafeFPMath(F); 902 903 bool MadeChange = false; 904 905 for (BasicBlock &BB : F) { 906 BasicBlock::iterator Next; 907 for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) { 908 Next = std::next(I); 909 MadeChange |= visit(*I); 910 } 911 } 912 913 return MadeChange; 914 } 915 916 INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE, 917 "AMDGPU IR optimizations", false, false) 918 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 919 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) 920 INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations", 921 false, false) 922 923 char AMDGPUCodeGenPrepare::ID = 0; 924 925 FunctionPass *llvm::createAMDGPUCodeGenPreparePass() { 926 return new AMDGPUCodeGenPrepare(); 927 } 928