//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
/// the modified arguments.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(II.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);

  // Erase and replace uses
  if (!II.getType()->isVoidTy())
    IC.replaceInstUsesWith(II, NewCall);
  return IC.eraseInstFromFunction(II);
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() +
                           ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    if (!canSafelyConvertTo16Bit(*Bias))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for
      // f16), should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
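      // The instruction returns an exponent of 0 for infinity and NaN inputs
      // rather than the IEEE frexp result, so fold those cases to 0.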
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
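      // An ordered compare against +0.0 also matches -0.0, so both zero
      // classes are covered without checking the sign.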
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
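    // With constant 'offset' and 'width', the extract becomes a left shift
    // that discards the bits above the field followed by a right shift (ashr
    // for sbfe, lshr for ubfe) that places the field at bit 0:
    //   (src << (size - offset - width)) >> (size - width)
    // When the field already reaches the top bit, only the right shift by
    // 'offset' is needed.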
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
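    // A single NaN (or undef) operand reduces fmed3 to a two-input min/max of
    // the remaining operands, handled below.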
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
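      // Swapping the operands also requires swapping the predicate, e.g.
      // icmp(C, x, sgt) -> icmp(x, C, slt).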
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
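        // No lane can contribute a set bit when the condition is false in
        // every lane.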
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same
    // between the def and use.
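    // Restricting the fold to sources defined in the same block is a
    // conservative stand-in for proving that the exec mask is unchanged
    // between the def and this use.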
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src,
                PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
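        // (ActiveBits == 4 with a single unused leading component is exactly
        // the case that would shrink a vec4 result to vec3.)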
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}