//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include <cassert>
#include <iterator>

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST = nullptr;
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;

  /// \returns \p T's base element bit width.
  unsigned getBaseElementBitWidth(const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given type \p T. For example,
  /// if \p T is i7, then i32 is returned; if \p T is <3 x i12>, then <3 x i32>
  /// is returned.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \returns True if type \p T needs to be promoted to 32 bit integer type,
  /// false otherwise.
  bool needsPromotionToI32(const Type *T) const;

  /// \brief Promotes uniform binary operation \p I to equivalent 32 bit binary
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with equivalent 32 bit binary operation, and
  /// truncating the result of 32 bit binary operation back to \p I's original
  /// type. Division operation is not promoted.
  ///
  /// \returns True if \p I is promoted to equivalent 32 bit binary operation,
  /// false otherwise.
  bool promoteUniformOpToI32(BinaryOperator &I) const;

  /// \brief Promotes uniform 'icmp' operation \p I to 32 bit 'icmp' operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, and replacing \p I with 32 bit 'icmp' operation.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(ICmpInst &I) const;

  /// \brief Promotes uniform 'select' operation \p I to 32 bit 'select'
  /// operation.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by sign or zero extending operands
  /// to 32 bits, replacing \p I with 32 bit 'select' operation, and truncating
  /// the result of 32 bit 'select' operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformOpToI32(SelectInst &I) const;

  /// \brief Promotes uniform 'bitreverse' intrinsic \p I to 32 bit 'bitreverse'
  /// intrinsic.
  ///
  /// \details \p I's base element bit width must be greater than 1 and less
  /// than or equal to 16. Promotion is done by zero extending the operand to
  /// 32 bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
  /// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
  /// shift amount is 32 minus \p I's base element bit width), and truncating
  /// the result of the shift operation back to \p I's original type.
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;

public:
  static char ID;

  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID), TM(static_cast<const GCNTargetMachine *>(TM)) {}

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitBitreverseIntrinsicInst(IntrinsicInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // end anonymous namespace

unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
    return true;

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    // TODO: The set of packed operations is more limited, so may want to
    // promote some anyway.
    if (ST->hasVOP3PInsts())
      return false;

    return needsPromotionToI32(VT->getElementType());
  }

  return false;
}

// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
    return true;
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

// Return true if the op promoted to i32 should have nuw set.
static bool promotedOpIsNUW(const Instruction &I) {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
    return true;
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
    if (promotedOpIsNSW(cast<Instruction>(I)))
      Inst->setHasNoSignedWrap();

    if (promotedOpIsNUW(cast<Instruction>(I)))
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());
  }

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}
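
// Illustrative sketch of the uniform 'icmp' promotion implemented by
// promoteUniformOpToI32(ICmpInst) above. The IR below is hypothetical (value
// names invented for the example) and is not emitted verbatim by this pass:
//
//   %cmp = icmp slt i16 %a, %b
//
// becomes, since the predicate is signed and therefore uses sign extension:
//
//   %a.ext = sext i16 %a to i32
//   %b.ext = sext i16 %b to i32
//   %cmp   = icmp slt i32 %a.ext, %b.ext
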
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
    IntrinsicInst &I) const {
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Function *I32 =
      Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty });
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is partially
    // constant. This works when the scalarizer pass is run first.
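    //
    // Sketch (illustrative only; value names and index types are hypothetical)
    // of the per-element expansion performed by the loop below for a
    // <2 x float> divide:
    //
    //   %n0 = extractelement <2 x float> %num, i32 0
    //   %d0 = extractelement <2 x float> %den, i32 0
    //   %r0 = call float @llvm.amdgcn.fdiv.fast(float %n0, float %d0)
    //   %v0 = insertelement <2 x float> undef, float %r0, i32 0
    //
    // and likewise for element 1, except that elements whose numerator passes
    // shouldKeepFDivF32 keep a plain fdiv instead of the intrinsic call.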
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return true;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformOpToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  default:
    return false;
  }
}

bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      DA->isUniform(&I))
    Changed |= promoteUniformBitreverseToI32(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}
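
// Usage sketch (an assumption for illustration, not code from this file): a
// target pass pipeline could schedule this pass through the factory above,
// e.g. given a legacy 'PassManagerBase &PM' and a 'const GCNTargetMachine *TM'
// in scope:
//
//   PM.add(createAMDGPUCodeGenPreparePass(TM));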