//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
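// Assumes the caller has already checked, via canSafelyConvertTo16Bit, that
// the value is either a widening cast from a 16-bit type or a floating-point
// constant that is exactly representable as a half.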
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
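    // rcp of undef has no defined result; fold it to a canonical quiet NaN.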
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for
      // f16), should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
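      // An ordered equality compare against +0.0 also returns true for -0.0,
      // so a single fcmp oeq covers both zero classes.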
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
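    // ubfe/sbfe extract 'width' bits starting at bit 'offset'. With constant
    // operands this becomes a shl that discards the bits above the field,
    // followed by a lshr/ashr (or a single right shift when the field reaches
    // the top of the register).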
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // values since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
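    // A single NaN or undef source collapses the med3 to a minnum/maxnum of
    // the two remaining operands.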
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
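      // Only Src0 is constant here (the all-constant case was folded above),
      // so swap the operands and the predicate together.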
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
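    // Conservatively restrict the remaining folds to sources defined in the
    // same basic block as this call.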
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
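  // Buffer intrinsics (DMaskIdx < 0) are narrowed by trimming unused trailing
  // components and, where possible, folding unused leading components into the
  // offset; image intrinsics are narrowed by shrinking the dmask.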
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}