//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not
    // lose precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
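// For example, a value produced by fpext/sext/zext of a 16-bit source folds
// back to that source, other integers are truncated with an intcast to i16,
// and other floats with an fpcast to half; callers are expected to have
// checked canSafelyConvertTo16Bit first.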
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for
      // f16), should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
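      // For a constant NaN or infinity input, frexp_exp therefore folds to 0
      // here rather than to the APFloat IEK_NaN/IEK_Inf sentinel values.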
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
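      // i.e. llvm.amdgcn.class(x, 0x60) becomes fcmp oeq x, 0.0, which, like
      // the two zero class bits together, does not distinguish +0.0 from -0.0.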
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
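    // With constant offset/width on i32, e.g. ubfe(x, 8, 4) becomes
    // lshr(shl(x, 20), 28) (sbfe uses ashr); when offset + width reaches the
    // top of the value, a single right shift by the offset suffices.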
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // values since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
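    // A NaN or undef operand drops out of the median directly, e.g.
    // fmed3(nan, x, y) -> minnum(x, y) and fmed3(x, y, nan) -> maxnum(x, y).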
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
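      // i.e. amdgcn.icmp(c, x, pred) -> amdgcn.icmp(x, c, swapped pred), so
      // the folds below only need to look for a constant on the right.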
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same
    // between the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
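    // e.g. fmul_legacy(x, 2.0) becomes a plain fmul, since a finite non-zero
    // constant on either side rules out the 0.0 * inf/nan special case.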
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image
/// intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened
        // to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
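        // e.g. a raw_buffer_load of <4 x float> where only the high two lanes
        // are demanded keeps DemandedElts = 0b1100, bumps the byte offset by
        // 2 * 4 = 8, and is rebuilt below as a <2 x float> load.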
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}