//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU specific InstCombine hooks of
// TargetTransformInfo. It uses the target's detailed information to fold and
// simplify AMDGPU intrinsic calls, while leaving target independent combines
// to the generic InstCombine pass.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the value down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
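// The caller is expected to have checked canSafelyConvertTo16Bit() first; for
// fpext/sext/zext inputs this simply returns the original 16-bit source value.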
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // If true, only the derivatives can be converted to 16 bit, not the
  // coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
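      // The instruction returns an exponent of 0 for nan and infinity inputs,
      // so fold to 0 here instead of propagating APFloat's IEK_NaN/IEK_Inf
      // sentinel values.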
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
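      // An ordered fcmp oeq against +0.0 matches -0.0 as well, so it covers
      // both the N_ZERO and P_ZERO class bits.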
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
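    // [us]bfe src, offset, width extracts a width-bit field starting at bit
    // offset: shift the field up to the top of the register, then shift it
    // back down with lshr (ubfe) or ashr (sbfe) to zero- or sign-extend it.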
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, the ashr and lshr created below would produce
    // poison, since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
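      // The predicate is carried as an immediate operand, so it has to be
      // swapped along with the operands.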
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if exec is not the same between the
    // def and the use.
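    // Restrict them to sources defined in the same basic block as the
    // intrinsic call.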
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}