//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
  ///
  /// \returns Binary operation \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to the equivalent 32 bit
  /// binary operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the equivalent 32 bit binary operation,
  /// and truncating the result of the 32 bit binary operation back to \p I's
  /// original type. Division operations are not promoted.
  ///
  /// \returns True if \p I is promoted to the equivalent 32 bit binary
  /// operation, false otherwise.
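  ///
  /// For illustration only (an assumed example, not taken from this source):
  /// a uniform, unsigned i16 addition
  ///   %r = add i16 %a, %b
  /// would be rewritten roughly as
  ///   %a32 = zext i16 %a to i32
  ///   %b32 = zext i16 %b to i32
  ///   %r32 = add i32 %a32, %b32
  ///   %r   = trunc i32 %r32 to i16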
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with the 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with the 32 bit 'select' operation, and
  /// truncating the result of the 32 bit 'select' operation back to \p I's
  /// original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to the 32 bit
  /// 'bitreverse' intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with the 32 bit 'bitreverse' intrinsic, shifting
  /// the result of the 32 bit 'bitreverse' intrinsic to the right with zero
  /// fill (the shift amount is 32 minus \p I's base element bit width), and
  /// truncating the result of the shift operation back to \p I's original
  /// type.
  ///
  /// \returns True.
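  ///
  /// For illustration only (an assumed example, not taken from this source):
  /// a uniform i16 bitreverse
  ///   %r = call i16 @llvm.bitreverse.i16(i16 %x)
  /// would be rewritten roughly as
  ///   %e   = zext i16 %x to i32
  ///   %r32 = call i32 @llvm.bitreverse.i32(i32 %e)
  ///   %s   = lshr i32 %r32, 16    ; 32 minus the original bit width of 16
  ///   %r   = trunc i32 %s to i16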
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
  if (!BinOp) // Possibly constant expression.
    return V;

  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp))
    BinOp->setIsExact(I.isExact());

  return V;
}

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
      I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
      T->getIntegerBitWidth() <= 16)
    return true;
  if (!T->isVectorTy())
    return false;
  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

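// Illustrative note (added commentary, not from the original source): a
// reciprocal such as
//   %r = fdiv float 1.000000e+00, %x, !fpmath !0
// is kept as a plain fdiv by the check below so the reciprocal lowering can
// handle it, while other numerators are routed to the fast fdiv intrinsic in
// visitFDiv.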
static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
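//
// For illustration only (an assumed example, not taken from this source):
// with !fpmath metadata of 2.5 ulp or looser and no f32 denormals, a scalar
//   %d = fdiv float %x, %y, !fpmath !0
// is replaced by roughly
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)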
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}