//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
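    // As with rcp above, rsq of undef has no meaningful result, so fold it to
    // a canonical quiet NaN.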
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
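      // (fcmp oeq also returns false for a NaN input, matching fp_class with
      // only the zero-test bits set.)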
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
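    //
    // For example (illustrative, assuming a 32-bit type):
    //   ubfe(x, 8, 4) -> lshr(shl(x, 20), 28)
    //   sbfe(x, 8, 4) -> ashr(shl(x, 20), 28)
    // When offset + width reaches the top of the value, the shl is unnecessary
    // and a single right shift by the offset suffices.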
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // values since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
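    //
    // For example, fmed3(x, y, NaN) becomes maxnum(x, y) with x and y still in
    // their original positions.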
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
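      //
      // e.g. llvm.amdgcn.icmp(7, x, slt) -> llvm.amdgcn.icmp(x, 7, sgt)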
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec mask is not the same
    // between the def and use.
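    // Restricting the fold to a single basic block is a conservative way to
    // ensure no divergent branch changes the set of active lanes in between.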
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedElts,
                                                     int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.
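    //
    // For example, if only the last element of a <4 x float> raw buffer load
    // is used, the load can be narrowed to a single float and the byte offset
    // advanced by 12 (three unused 4-byte components), which is what the
    // offset update below does.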

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened
        // to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
  // fully supported.
  if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
    return nullptr;

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}