//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU specific InstCombine hooks of the target's
// TargetTransformInfo implementation (GCNTTIImpl::instCombineIntrinsic and
// GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic). They use the target's
// detailed information to fold and simplify calls to llvm.amdgcn.* intrinsics,
// while letting the target independent InstCombine code handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

// Row type of the image dmask intrinsic table included below; Intr holds an
// intrinsic ID.
// NOTE(review): the table and its getAMDGPUImageDMaskIntrinsic() lookup are
// presumably generated into InstCombineTables.inc - confirm.
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
40 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1, 41 const APFloat &Src2) { 42 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2); 43 44 APFloat::cmpResult Cmp0 = Max3.compare(Src0); 45 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately"); 46 if (Cmp0 == APFloat::cmpEqual) 47 return maxnum(Src1, Src2); 48 49 APFloat::cmpResult Cmp1 = Max3.compare(Src1); 50 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately"); 51 if (Cmp1 == APFloat::cmpEqual) 52 return maxnum(Src0, Src2); 53 54 return maxnum(Src0, Src1); 55 } 56 57 // Check if a value can be converted to a 16-bit value without losing 58 // precision. 59 static bool canSafelyConvertTo16Bit(Value &V) { 60 Type *VTy = V.getType(); 61 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) { 62 // The value is already 16-bit, so we don't want to convert to 16-bit again! 63 return false; 64 } 65 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) { 66 // We need to check that if we cast the index down to a half, we do not lose 67 // precision. 68 APFloat FloatValue(ConstFloat->getValueAPF()); 69 bool LosesInfo = true; 70 FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo); 71 return !LosesInfo; 72 } 73 Value *CastSrc; 74 if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) || 75 match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) || 76 match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) { 77 Type *CastSrcTy = CastSrc->getType(); 78 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16)) 79 return true; 80 } 81 82 return false; 83 } 84 85 // Convert a value to 16-bit. 
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  // An extension is undone by simply returning its source operand.
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  // Otherwise emit an explicit cast to i16 (unsigned) or half.
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

// Try to rewrite the image intrinsic call \p II so that its gradient operands
// (and, when possible, its coordinate operands too) are passed as 16-bit
// values. \p ImageDimIntr supplies the operand and overload-type indices for
// the intrinsic.
//
// Returns None when the subtarget has neither A16 nor G16, when the operands
// cannot be narrowed safely, or when the intrinsic signature cannot be
// recovered; otherwise returns the result of replacing \p II with the new
// call.
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      // Give up if the failing operand is itself a gradient, or if the
      // intrinsic has no gradient operands at all.
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    // All operands inspected so far must agree on being float or integer.
    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  // Re-declare the intrinsic with the narrowed overload type(s).
  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  // Narrow the gradients only, or everything up to the end of the address
  // operands.
  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

// Returns true if a legacy multiply of \p Op0 and \p Op1 can be replaced with
// a normal multiply, i.e. if the special +/-0.0 handling of the legacy
// operation can be proven not to matter for these operands.
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

// Fold or canonicalize the AMDGPU intrinsic call \p II.
//
// Returns None when no simplification applies. Otherwise returns whatever the
// InstCombiner helper used for the rewrite returned (replaceInstUsesWith,
// replaceOperand or eraseInstFromFunction), or &II when the call was modified
// in place.
Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    // Constant fold as 1.0 / C.
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    // Constant fold both halves of frexp from the same APFloat computation.
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    // Bits of the class mask operand, one per floating-point class.
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    // If no tests are made, the result is always false.
    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    // Both operands are constant: evaluate every requested class test.
    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    // Constant fold by converting both inputs with round-toward-zero.
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    // Only the all-undef case is folded here.
    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      // A width that is zero once the high bits are dropped yields zero.
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      // Likewise reduce an out-of-range offset to its low bits.
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe.  If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    // The field runs to the top bit, so a single right shift is enough.
    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    // Replace every source operand whose enable bit(s) are clear with undef.
    // The compressed form has 2 sources covered by 2 enable bits each; the
    // plain form has 4 sources with 1 enable bit each.
    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    // All three operands constant: fold to the median.
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        // Both sides constant: a comparison that folds to false gives zero.
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    // The remaining folds only apply to eq/ne comparisons.
    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          // Extend according to the signedness of the predicate.
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        // A 32-bit result reads exec_lo, a 64-bit result reads exec; any
        // other result type is left alone.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // C stays null unless Op0 matches as a floating-point constant.
    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    // Anything else is only handled if it is an image dimension intrinsic.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
909 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, 910 IntrinsicInst &II, 911 APInt DemandedElts, 912 int DMaskIdx = -1) { 913 914 auto *IIVTy = cast<FixedVectorType>(II.getType()); 915 unsigned VWidth = IIVTy->getNumElements(); 916 if (VWidth == 1) 917 return nullptr; 918 919 IRBuilderBase::InsertPointGuard Guard(IC.Builder); 920 IC.Builder.SetInsertPoint(&II); 921 922 // Assume the arguments are unchanged and later override them, if needed. 923 SmallVector<Value *, 16> Args(II.args()); 924 925 if (DMaskIdx < 0) { 926 // Buffer case. 927 928 const unsigned ActiveBits = DemandedElts.getActiveBits(); 929 const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros(); 930 931 // Start assuming the prefix of elements is demanded, but possibly clear 932 // some other bits if there are trailing zeros (unused components at front) 933 // and update offset. 934 DemandedElts = (1 << ActiveBits) - 1; 935 936 if (UnusedComponentsAtFront > 0) { 937 static const unsigned InvalidOffsetIdx = 0xf; 938 939 unsigned OffsetIdx; 940 switch (II.getIntrinsicID()) { 941 case Intrinsic::amdgcn_raw_buffer_load: 942 OffsetIdx = 1; 943 break; 944 case Intrinsic::amdgcn_s_buffer_load: 945 // If resulting type is vec3, there is no point in trimming the 946 // load with updated offset, as the vec3 would most likely be widened to 947 // vec4 anyway during lowering. 948 if (ActiveBits == 4 && UnusedComponentsAtFront == 1) 949 OffsetIdx = InvalidOffsetIdx; 950 else 951 OffsetIdx = 1; 952 break; 953 case Intrinsic::amdgcn_struct_buffer_load: 954 OffsetIdx = 2; 955 break; 956 default: 957 // TODO: handle tbuffer* intrinsics. 958 OffsetIdx = InvalidOffsetIdx; 959 break; 960 } 961 962 if (OffsetIdx != InvalidOffsetIdx) { 963 // Clear demanded bits and update the offset. 
964 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1); 965 auto *Offset = II.getArgOperand(OffsetIdx); 966 unsigned SingleComponentSizeInBits = 967 IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType()); 968 unsigned OffsetAdd = 969 UnusedComponentsAtFront * SingleComponentSizeInBits / 8; 970 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd); 971 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal); 972 } 973 } 974 } else { 975 // Image case. 976 977 ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx)); 978 unsigned DMaskVal = DMask->getZExtValue() & 0xf; 979 980 // Mask off values that are undefined because the dmask doesn't cover them 981 DemandedElts &= (1 << countPopulation(DMaskVal)) - 1; 982 983 unsigned NewDMaskVal = 0; 984 unsigned OrigLoadIdx = 0; 985 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) { 986 const unsigned Bit = 1 << SrcIdx; 987 if (!!(DMaskVal & Bit)) { 988 if (!!DemandedElts[OrigLoadIdx]) 989 NewDMaskVal |= Bit; 990 OrigLoadIdx++; 991 } 992 } 993 994 if (DMaskVal != NewDMaskVal) 995 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal); 996 } 997 998 unsigned NewNumElts = DemandedElts.countPopulation(); 999 if (!NewNumElts) 1000 return UndefValue::get(II.getType()); 1001 1002 if (NewNumElts >= VWidth && DemandedElts.isMask()) { 1003 if (DMaskIdx >= 0) 1004 II.setArgOperand(DMaskIdx, Args[DMaskIdx]); 1005 return nullptr; 1006 } 1007 1008 // Validate function argument and return types, extracting overloaded types 1009 // along the way. 1010 SmallVector<Type *, 6> OverloadTys; 1011 if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys)) 1012 return nullptr; 1013 1014 Module *M = II.getParent()->getParent()->getParent(); 1015 Type *EltTy = IIVTy->getElementType(); 1016 Type *NewTy = 1017 (NewNumElts == 1) ? 
EltTy : FixedVectorType::get(EltTy, NewNumElts); 1018 1019 OverloadTys[0] = NewTy; 1020 Function *NewIntrin = 1021 Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys); 1022 1023 CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); 1024 NewCall->takeName(&II); 1025 NewCall->copyMetadata(II); 1026 1027 if (NewNumElts == 1) { 1028 return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()), 1029 NewCall, 1030 DemandedElts.countTrailingZeros()); 1031 } 1032 1033 SmallVector<int, 8> EltMask; 1034 unsigned NewLoadIdx = 0; 1035 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) { 1036 if (!!DemandedElts[OrigLoadIdx]) 1037 EltMask.push_back(NewLoadIdx++); 1038 else 1039 EltMask.push_back(NewNumElts); 1040 } 1041 1042 Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask); 1043 1044 return Shuffle; 1045 } 1046 1047 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic( 1048 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 1049 APInt &UndefElts2, APInt &UndefElts3, 1050 std::function<void(Instruction *, unsigned, APInt, APInt &)> 1051 SimplifyAndSetOp) const { 1052 switch (II.getIntrinsicID()) { 1053 case Intrinsic::amdgcn_buffer_load: 1054 case Intrinsic::amdgcn_buffer_load_format: 1055 case Intrinsic::amdgcn_raw_buffer_load: 1056 case Intrinsic::amdgcn_raw_buffer_load_format: 1057 case Intrinsic::amdgcn_raw_tbuffer_load: 1058 case Intrinsic::amdgcn_s_buffer_load: 1059 case Intrinsic::amdgcn_struct_buffer_load: 1060 case Intrinsic::amdgcn_struct_buffer_load_format: 1061 case Intrinsic::amdgcn_struct_tbuffer_load: 1062 case Intrinsic::amdgcn_tbuffer_load: 1063 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts); 1064 default: { 1065 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) { 1066 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0); 1067 } 1068 break; 1069 } 1070 } 1071 return None; 1072 } 1073