//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"

#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

using namespace llvm;

namespace {

class AMDGPUCodeGenPrepare : public FunctionPass,
                             public InstVisitor<AMDGPUCodeGenPrepare, bool> {
  const GCNTargetMachine *TM;
  const SISubtarget *ST;
  DivergenceAnalysis *DA;
  Module *Mod;
  bool HasUnsafeFPMath;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operator \p I to
  /// binary operator \p V.
  ///
  /// \returns Binary operator \p V.
  Value *copyFlags(const BinaryOperator &I, Value *V) const;

  /// \returns Equivalent 16 bit integer type for given 32 bit integer type
  /// \p T.
  Type *getI16Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns Equivalent 32 bit integer type for given 16 bit integer type
  /// \p T.
  Type *getI32Ty(IRBuilder<> &B, const Type *T) const;

  /// \returns True if the base element of type \p T is 16 bit integer, false
  /// otherwise.
  bool isI16Ty(const Type *T) const;

  /// \returns True if the base element of type \p T is 32 bit integer, false
  /// otherwise.
  bool isI32Ty(const Type *T) const;

  /// \returns True if binary operation \p I is a signed binary operation,
  /// false otherwise.
  bool isSigned(const BinaryOperator &I) const;

  /// \returns True if the condition of 'select' operation \p I comes from a
  /// signed 'icmp' operation, false otherwise.
  bool isSigned(const SelectInst &I) const;

  /// \brief Promotes uniform 16 bit binary operation \p I to equivalent 32 bit
  /// binary operation by sign or zero extending operands to 32 bits, replacing
  /// 16 bit operation with equivalent 32 bit operation, and truncating the
  /// result of 32 bit operation back to 16 bits. 16 bit division operation is
  /// not promoted.
  ///
  /// \returns True if 16 bit binary operation is promoted to equivalent 32 bit
  /// binary operation, false otherwise.
  bool promoteUniformI16OpToI32Op(BinaryOperator &I) const;

  /// \brief Promotes uniform 16 bit 'icmp' operation \p I to 32 bit 'icmp'
  /// operation by sign or zero extending operands to 32 bits, and replacing 16
  /// bit operation with 32 bit operation.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(ICmpInst &I) const;

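  // For illustration only: the promoteUniformI16OpToI32Op helpers rewrite a
  // uniform 16 bit operation roughly as follows (sketch for an unsigned add;
  // the value names are placeholders, not produced verbatim by this pass):
  //
  //   %r = add i16 %a, %b
  //
  // becomes
  //
  //   %a.ext = zext i16 %a to i32
  //   %b.ext = zext i16 %b to i32
  //   %r.ext = add i32 %a.ext, %b.ext
  //   %r.trunc = trunc i32 %r.ext to i16
  //
  // with all uses of %r replaced by %r.trunc. Signed operations use sext
  // instead of zext, and 'icmp' needs no truncation of its i1 result.
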
  /// \brief Promotes uniform 16 bit 'select' operation \p I to 32 bit 'select'
  /// operation by sign or zero extending operands to 32 bits, replacing 16 bit
  /// operation with 32 bit operation, and truncating the result of 32 bit
  /// operation back to 16 bits.
  ///
  /// \returns True.
  bool promoteUniformI16OpToI32Op(SelectInst &I) const;

public:
  static char ID;
  AMDGPUCodeGenPrepare(const TargetMachine *TM = nullptr) :
    FunctionPass(ID),
    TM(static_cast<const GCNTargetMachine *>(TM)),
    ST(nullptr),
    DA(nullptr),
    Mod(nullptr),
    HasUnsafeFPMath(false) { }

  bool visitFDiv(BinaryOperator &I);

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU IR optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DivergenceAnalysis>();
    AU.setPreservesAll();
  }
};

} // End anonymous namespace

Value *AMDGPUCodeGenPrepare::copyFlags(
    const BinaryOperator &I, Value *V) const {
  assert(isa<BinaryOperator>(V) && "V must be binary operator");

  BinaryOperator *BinOp = cast<BinaryOperator>(V);
  if (isa<OverflowingBinaryOperator>(BinOp)) {
    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
  } else if (isa<PossiblyExactOperator>(BinOp)) {
    BinOp->setIsExact(I.isExact());
  }

  return V;
}

Type *AMDGPUCodeGenPrepare::getI16Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI32Ty(T) && "T must be 32 bits");

  if (T->isIntegerTy())
    return B.getInt16Ty();
  return VectorType::get(B.getInt16Ty(), cast<VectorType>(T)->getNumElements());
}

Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
  assert(isI16Ty(T) && "T must be 16 bits");

  if (T->isIntegerTy())
    return B.getInt32Ty();
  return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
}

bool AMDGPUCodeGenPrepare::isI16Ty(const Type *T) const {
  if (T->isIntegerTy(16))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(16);
}

bool AMDGPUCodeGenPrepare::isI32Ty(const Type *T) const {
  if (T->isIntegerTy(32))
    return true;
  if (!T->isVectorTy())
    return false;
  return cast<VectorType>(T)->getElementType()->isIntegerTy(32);
}

bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
      cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}

bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(BinaryOperator &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::UDiv)
    return false;

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
  }
  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(ICmpInst &I) const {
  assert(isI16Ty(I.getOperand(0)->getType()) && "Op0 must be 16 bits");
  assert(isI16Ty(I.getOperand(1)->getType()) && "Op1 must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32TyOp0 = getI32Ty(Builder, I.getOperand(0)->getType());
  Type *I32TyOp1 = getI32Ty(Builder, I.getOperand(1)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

  if (I.isSigned()) {
    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32TyOp1);
  } else {
    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32TyOp0);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32TyOp1);
  }
  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
  I.eraseFromParent();

  return true;
}

bool AMDGPUCodeGenPrepare::promoteUniformI16OpToI32Op(SelectInst &I) const {
  assert(isI16Ty(I.getType()) && "Op must be 16 bits");

  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

  if (isSigned(I)) {
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);
  } else {
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);
  }
  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, getI16Ty(Builder, ExtRes->getType()));

  I.replaceAllUsesWith(TruncRes);
  I.eraseFromParent();

  return true;
}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv) {
  const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
  if (!CNum)
    return false;

  // Reciprocal f32 is handled separately without denormals.
  return UnsafeDiv || CNum->isExactlyValue(+1.0);
}

// Insert an intrinsic for fast fdiv for safe math situations where we can
// reduce precision. Leave fdiv for situations where the generic node is
// expected to be optimized.
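//
// For illustration only (value names and metadata are placeholders, not
// produced verbatim by this pass): an eligible scalar f32 division such as
//
//   %d = fdiv float %x, %y, !fpmath !0   ; !0 = !{float 2.5}
//
// is rewritten into a call to the fast fdiv intrinsic:
//
//   %d = call float @llvm.amdgcn.fdiv.fast(float %x, float %y)
//
// Vector divisions are handled element by element below.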
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
  Type *Ty = FDiv.getType();

  // TODO: Handle half
  if (!Ty->getScalarType()->isFloatTy())
    return false;

  MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
  if (!FPMath)
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  float ULP = FPOp->getFPAccuracy();
  if (ULP < 2.5f)
    return false;

  FastMathFlags FMF = FPOp->getFastMathFlags();
  bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() ||
                   FMF.allowReciprocal();
  if (ST->hasFP32Denormals() && !UnsafeDiv)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
  Builder.setFastMathFlags(FMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
  Function *Decl
    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *NewFDiv = nullptr;

  if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
    NewFDiv = UndefValue::get(VT);

    // FIXME: Doesn't do the right thing for cases where the vector is
    // partially constant. This works when the scalarizer pass is run first.
    for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
      Value *NumEltI = Builder.CreateExtractElement(Num, I);
      Value *DenEltI = Builder.CreateExtractElement(Den, I);
      Value *NewElt;

      if (shouldKeepFDivF32(NumEltI, UnsafeDiv)) {
        NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
      } else {
        NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
      }

      NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
    }
  } else {
    if (!shouldKeepFDivF32(Num, UnsafeDiv))
      NewFDiv = Builder.CreateCall(Decl, { Num, Den });
  }

  if (NewFDiv) {
    FDiv.replaceAllUsesWith(NewFDiv);
    NewFDiv->takeName(&FDiv);
    FDiv.eraseFromParent();
  }

  return !!NewFDiv;
}

static bool hasUnsafeFPMath(const Function &F) {
  Attribute Attr = F.getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsString() == "true";
}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getOperand(0)->getType()) &&
      isI16Ty(I.getOperand(1)->getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
  bool Changed = false;

  // TODO: Should we promote smaller types that will be legalized to i16?
  if (ST->has16BitInsts() && isI16Ty(I.getType()) && DA->isUniform(&I))
    Changed |= promoteUniformI16OpToI32Op(I);

  return Changed;
}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  return false;
}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  ST = &TM->getSubtarget<SISubtarget>(F);
  DA = &getAnalysis<DivergenceAnalysis>();
  HasUnsafeFPMath = hasUnsafeFPMath(F);

  bool MadeChange = false;

  for (BasicBlock &BB : F) {
    BasicBlock::iterator Next;
    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
      Next = std::next(I);
      MadeChange |= visit(*I);
    }
  }

  return MadeChange;
}

INITIALIZE_TM_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                         "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_TM_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                       "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM) {
  return new AMDGPUCodeGenPrepare(TM);
}