//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
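// Callers are expected to have checked canSafelyConvertTo16Bit first.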
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
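    // As with rsq and ldexp below, an undef input folds to a quiet NaN.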
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
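      // (fcmp oeq against +0.0 is also true for -0.0, so both zero classes
      // are covered.)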
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
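    // e.g. on i32, sbfe(x, 8, 16) becomes ashr (shl x, 8), 16.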
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // values since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
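    // A NaN (or undef) operand reduces the median to a min/max of the
    // remaining two operands.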
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
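      // Swapping the operands requires swapping the predicate too.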
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
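        // (A constant-false condition sets no bit in any lane.)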
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
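    // As a conservative approximation, only handle a def in the same block.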
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
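  // Buffer loads are narrowed by bumping the byte offset; image loads by
  // shrinking the dmask.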
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
  // fully supported.
  if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
    return nullptr;

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}