//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// \file
// This file implements AMDGPU-specific InstCombine transformations. It uses
// the target's detailed information to fold and simplify calls to AMDGPU
// (amdgcn) intrinsics more aggressively than the target-independent
// InstCombine rules can.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "llvm/IR/IntrinsicsAMDGPU.h"
20 #include "llvm/Transforms/InstCombine/InstCombiner.h"
21 
22 using namespace llvm;
23 
24 #define DEBUG_TYPE "AMDGPUtti"
25 
26 namespace {
27 
// Entry type for the dmask intrinsic table generated into
// InstCombineTables.inc (included below via
// GET_AMDGPUImageDMaskIntrinsicTable_IMPL).
struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr; // Intrinsic ID — presumably an Intrinsic::ID value; confirm
                 // against the generated table.
};
31 
32 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
33 #include "InstCombineTables.inc"
34 
35 } // end anonymous namespace
36 
37 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
38 //
39 // A single NaN input is folded to minnum, so we rely on that folding for
40 // handling NaNs.
41 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
42                            const APFloat &Src2) {
43   APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
44 
45   APFloat::cmpResult Cmp0 = Max3.compare(Src0);
46   assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
47   if (Cmp0 == APFloat::cmpEqual)
48     return maxnum(Src1, Src2);
49 
50   APFloat::cmpResult Cmp1 = Max3.compare(Src1);
51   assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
52   if (Cmp1 == APFloat::cmpEqual)
53     return maxnum(Src0, Src2);
54 
55   return maxnum(Src0, Src1);
56 }
57 
58 // Check if a value can be converted to a 16-bit value without losing
59 // precision.
60 static bool canSafelyConvertTo16Bit(Value &V) {
61   Type *VTy = V.getType();
62   if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
63     // The value is already 16-bit, so we don't want to convert to 16-bit again!
64     return false;
65   }
66   if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
67     // We need to check that if we cast the index down to a half, we do not lose
68     // precision.
69     APFloat FloatValue(ConstFloat->getValueAPF());
70     bool LosesInfo = true;
71     FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
72     return !LosesInfo;
73   }
74   Value *CastSrc;
75   if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
76       match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
77       match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
78     Type *CastSrcTy = CastSrc->getType();
79     if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
80       return true;
81   }
82 
83   return false;
84 }
85 
86 // Convert a value to 16-bit.
87 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
88   Type *VTy = V.getType();
89   if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
90     return cast<Instruction>(&V)->getOperand(0);
91   if (VTy->isIntegerTy())
92     return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
93   if (VTy->isFloatingPointTy())
94     return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
95 
96   llvm_unreachable("Should never be called!");
97 }
98 
// Try to narrow the address operands of an image intrinsic from 32-bit to
// 16-bit (A16 for coordinates, G16 for gradients) when every affected operand
// is provably representable in 16 bits without precision loss.
// Returns the replacement instruction, or None if no rewrite applies.
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Without either A16 or G16 support no narrowing is possible.
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  // Scan the VAddr operands, gradients first, checking that each one can be
  // narrowed safely.
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      // A non-convertible gradient (or, when GradientStart == CoordStart, a
      // coordinate with no separate gradients) defeats the whole rewrite.
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    // Past the first gradient operand, all scanned operands must agree on
    // float vs. integer.
    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  // Check the subtarget actually supports the conversion we settled on.
  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  // Rebuild the overloaded intrinsic signature with the narrowed types.
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  // Narrow the gradients, and the coordinates too unless restricted to G16.
  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  // Emit the replacement call, carrying over name, metadata, and FP flags.
  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}
167 
168 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
169                                            InstCombiner &IC) const {
170   // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
171   // infinity, gives +0.0. If we can prove we don't have one of the special
172   // cases then we can use a normal multiply instead.
173   // TODO: Create and use isKnownFiniteNonZero instead of just matching
174   // constants here.
175   if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
176       match(Op1, PatternMatch::m_FiniteNonZero())) {
177     // One operand is not zero or infinity or NaN.
178     return true;
179   }
180   auto *TLI = &IC.getTargetLibraryInfo();
181   if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
182       isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
183     // Neither operand is infinity or NaN.
184     return true;
185   }
186   return false;
187 }
188 
189 Optional<Instruction *>
190 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
191   Intrinsic::ID IID = II.getIntrinsicID();
192   switch (IID) {
193   case Intrinsic::amdgcn_rcp: {
194     Value *Src = II.getArgOperand(0);
195 
196     // TODO: Move to ConstantFolding/InstSimplify?
197     if (isa<UndefValue>(Src)) {
198       Type *Ty = II.getType();
199       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
200       return IC.replaceInstUsesWith(II, QNaN);
201     }
202 
203     if (II.isStrictFP())
204       break;
205 
206     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
207       const APFloat &ArgVal = C->getValueAPF();
208       APFloat Val(ArgVal.getSemantics(), 1);
209       Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
210 
211       // This is more precise than the instruction may give.
212       //
213       // TODO: The instruction always flushes denormal results (except for f16),
214       // should this also?
215       return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
216     }
217 
218     break;
219   }
220   case Intrinsic::amdgcn_rsq: {
221     Value *Src = II.getArgOperand(0);
222 
223     // TODO: Move to ConstantFolding/InstSimplify?
224     if (isa<UndefValue>(Src)) {
225       Type *Ty = II.getType();
226       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
227       return IC.replaceInstUsesWith(II, QNaN);
228     }
229 
230     break;
231   }
232   case Intrinsic::amdgcn_frexp_mant:
233   case Intrinsic::amdgcn_frexp_exp: {
234     Value *Src = II.getArgOperand(0);
235     if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
236       int Exp;
237       APFloat Significand =
238           frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
239 
240       if (IID == Intrinsic::amdgcn_frexp_mant) {
241         return IC.replaceInstUsesWith(
242             II, ConstantFP::get(II.getContext(), Significand));
243       }
244 
245       // Match instruction special case behavior.
246       if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
247         Exp = 0;
248 
249       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
250     }
251 
252     if (isa<UndefValue>(Src)) {
253       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
254     }
255 
256     break;
257   }
258   case Intrinsic::amdgcn_class: {
259     enum {
260       S_NAN = 1 << 0,       // Signaling NaN
261       Q_NAN = 1 << 1,       // Quiet NaN
262       N_INFINITY = 1 << 2,  // Negative infinity
263       N_NORMAL = 1 << 3,    // Negative normal
264       N_SUBNORMAL = 1 << 4, // Negative subnormal
265       N_ZERO = 1 << 5,      // Negative zero
266       P_ZERO = 1 << 6,      // Positive zero
267       P_SUBNORMAL = 1 << 7, // Positive subnormal
268       P_NORMAL = 1 << 8,    // Positive normal
269       P_INFINITY = 1 << 9   // Positive infinity
270     };
271 
272     const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
273                               N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
274                               P_NORMAL | P_INFINITY;
275 
276     Value *Src0 = II.getArgOperand(0);
277     Value *Src1 = II.getArgOperand(1);
278     const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
279     if (!CMask) {
280       if (isa<UndefValue>(Src0)) {
281         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
282       }
283 
284       if (isa<UndefValue>(Src1)) {
285         return IC.replaceInstUsesWith(II,
286                                       ConstantInt::get(II.getType(), false));
287       }
288       break;
289     }
290 
291     uint32_t Mask = CMask->getZExtValue();
292 
293     // If all tests are made, it doesn't matter what the value is.
294     if ((Mask & FullMask) == FullMask) {
295       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
296     }
297 
298     if ((Mask & FullMask) == 0) {
299       return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
300     }
301 
302     if (Mask == (S_NAN | Q_NAN)) {
303       // Equivalent of isnan. Replace with standard fcmp.
304       Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
305       FCmp->takeName(&II);
306       return IC.replaceInstUsesWith(II, FCmp);
307     }
308 
309     if (Mask == (N_ZERO | P_ZERO)) {
310       // Equivalent of == 0.
311       Value *FCmp =
312           IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
313 
314       FCmp->takeName(&II);
315       return IC.replaceInstUsesWith(II, FCmp);
316     }
317 
318     // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
319     if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
320         isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
321       return IC.replaceOperand(
322           II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
323     }
324 
325     const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
326     if (!CVal) {
327       if (isa<UndefValue>(Src0)) {
328         return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
329       }
330 
331       // Clamp mask to used bits
332       if ((Mask & FullMask) != Mask) {
333         CallInst *NewCall = IC.Builder.CreateCall(
334             II.getCalledFunction(),
335             {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});
336 
337         NewCall->takeName(&II);
338         return IC.replaceInstUsesWith(II, NewCall);
339       }
340 
341       break;
342     }
343 
344     const APFloat &Val = CVal->getValueAPF();
345 
346     bool Result =
347         ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
348         ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
349         ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
350         ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
351         ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
352         ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
353         ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
354         ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
355         ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
356         ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());
357 
358     return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
359   }
360   case Intrinsic::amdgcn_cvt_pkrtz: {
361     Value *Src0 = II.getArgOperand(0);
362     Value *Src1 = II.getArgOperand(1);
363     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
364       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
365         const fltSemantics &HalfSem =
366             II.getType()->getScalarType()->getFltSemantics();
367         bool LosesInfo;
368         APFloat Val0 = C0->getValueAPF();
369         APFloat Val1 = C1->getValueAPF();
370         Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
371         Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
372 
373         Constant *Folded =
374             ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
375                                  ConstantFP::get(II.getContext(), Val1)});
376         return IC.replaceInstUsesWith(II, Folded);
377       }
378     }
379 
380     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
381       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
382     }
383 
384     break;
385   }
386   case Intrinsic::amdgcn_cvt_pknorm_i16:
387   case Intrinsic::amdgcn_cvt_pknorm_u16:
388   case Intrinsic::amdgcn_cvt_pk_i16:
389   case Intrinsic::amdgcn_cvt_pk_u16: {
390     Value *Src0 = II.getArgOperand(0);
391     Value *Src1 = II.getArgOperand(1);
392 
393     if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
394       return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
395     }
396 
397     break;
398   }
399   case Intrinsic::amdgcn_ubfe:
400   case Intrinsic::amdgcn_sbfe: {
401     // Decompose simple cases into standard shifts.
402     Value *Src = II.getArgOperand(0);
403     if (isa<UndefValue>(Src)) {
404       return IC.replaceInstUsesWith(II, Src);
405     }
406 
407     unsigned Width;
408     Type *Ty = II.getType();
409     unsigned IntSize = Ty->getIntegerBitWidth();
410 
411     ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
412     if (CWidth) {
413       Width = CWidth->getZExtValue();
414       if ((Width & (IntSize - 1)) == 0) {
415         return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
416       }
417 
418       // Hardware ignores high bits, so remove those.
419       if (Width >= IntSize) {
420         return IC.replaceOperand(
421             II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
422       }
423     }
424 
425     unsigned Offset;
426     ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
427     if (COffset) {
428       Offset = COffset->getZExtValue();
429       if (Offset >= IntSize) {
430         return IC.replaceOperand(
431             II, 1,
432             ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
433       }
434     }
435 
436     bool Signed = IID == Intrinsic::amdgcn_sbfe;
437 
438     if (!CWidth || !COffset)
439       break;
440 
    // The case of Width == 0 is handled above, which makes this transformation
    // safe.  If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
444     assert(Width != 0);
445 
446     // TODO: This allows folding to undef when the hardware has specific
447     // behavior?
448     if (Offset + Width < IntSize) {
449       Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
450       Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
451                                  : IC.Builder.CreateLShr(Shl, IntSize - Width);
452       RightShift->takeName(&II);
453       return IC.replaceInstUsesWith(II, RightShift);
454     }
455 
456     Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
457                                : IC.Builder.CreateLShr(Src, Offset);
458 
459     RightShift->takeName(&II);
460     return IC.replaceInstUsesWith(II, RightShift);
461   }
462   case Intrinsic::amdgcn_exp:
463   case Intrinsic::amdgcn_exp_compr: {
464     ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
465     unsigned EnBits = En->getZExtValue();
466     if (EnBits == 0xf)
467       break; // All inputs enabled.
468 
469     bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
470     bool Changed = false;
471     for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
472       if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
473           (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
474         Value *Src = II.getArgOperand(I + 2);
475         if (!isa<UndefValue>(Src)) {
476           IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
477           Changed = true;
478         }
479       }
480     }
481 
482     if (Changed) {
483       return &II;
484     }
485 
486     break;
487   }
488   case Intrinsic::amdgcn_fmed3: {
489     // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
490     // for the shader.
491 
492     Value *Src0 = II.getArgOperand(0);
493     Value *Src1 = II.getArgOperand(1);
494     Value *Src2 = II.getArgOperand(2);
495 
496     // Checking for NaN before canonicalization provides better fidelity when
497     // mapping other operations onto fmed3 since the order of operands is
498     // unchanged.
499     CallInst *NewCall = nullptr;
500     if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
501       NewCall = IC.Builder.CreateMinNum(Src1, Src2);
502     } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
503       NewCall = IC.Builder.CreateMinNum(Src0, Src2);
504     } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
505       NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
506     }
507 
508     if (NewCall) {
509       NewCall->copyFastMathFlags(&II);
510       NewCall->takeName(&II);
511       return IC.replaceInstUsesWith(II, NewCall);
512     }
513 
514     bool Swap = false;
515     // Canonicalize constants to RHS operands.
516     //
517     // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
518     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
519       std::swap(Src0, Src1);
520       Swap = true;
521     }
522 
523     if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
524       std::swap(Src1, Src2);
525       Swap = true;
526     }
527 
528     if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
529       std::swap(Src0, Src1);
530       Swap = true;
531     }
532 
533     if (Swap) {
534       II.setArgOperand(0, Src0);
535       II.setArgOperand(1, Src1);
536       II.setArgOperand(2, Src2);
537       return &II;
538     }
539 
540     if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
541       if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
542         if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
543           APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
544                                        C2->getValueAPF());
545           return IC.replaceInstUsesWith(
546               II, ConstantFP::get(IC.Builder.getContext(), Result));
547         }
548       }
549     }
550 
551     break;
552   }
553   case Intrinsic::amdgcn_icmp:
554   case Intrinsic::amdgcn_fcmp: {
555     const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
556     // Guard against invalid arguments.
557     int64_t CCVal = CC->getZExtValue();
558     bool IsInteger = IID == Intrinsic::amdgcn_icmp;
559     if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
560                        CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
561         (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
562                         CCVal > CmpInst::LAST_FCMP_PREDICATE)))
563       break;
564 
565     Value *Src0 = II.getArgOperand(0);
566     Value *Src1 = II.getArgOperand(1);
567 
568     if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
569       if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
570         Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
571         if (CCmp->isNullValue()) {
572           return IC.replaceInstUsesWith(
573               II, ConstantExpr::getSExt(CCmp, II.getType()));
574         }
575 
576         // The result of V_ICMP/V_FCMP assembly instructions (which this
577         // intrinsic exposes) is one bit per thread, masked with the EXEC
578         // register (which contains the bitmask of live threads). So a
579         // comparison that always returns true is the same as a read of the
580         // EXEC register.
581         Function *NewF = Intrinsic::getDeclaration(
582             II.getModule(), Intrinsic::read_register, II.getType());
583         Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
584         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
585         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
586         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
587         NewCall->addAttribute(AttributeList::FunctionIndex,
588                               Attribute::Convergent);
589         NewCall->takeName(&II);
590         return IC.replaceInstUsesWith(II, NewCall);
591       }
592 
593       // Canonicalize constants to RHS.
594       CmpInst::Predicate SwapPred =
595           CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
596       II.setArgOperand(0, Src1);
597       II.setArgOperand(1, Src0);
598       II.setArgOperand(
599           2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
600       return &II;
601     }
602 
603     if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
604       break;
605 
606     // Canonicalize compare eq with true value to compare != 0
607     // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
608     //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
609     // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
610     //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
611     Value *ExtSrc;
612     if (CCVal == CmpInst::ICMP_EQ &&
613         ((match(Src1, PatternMatch::m_One()) &&
614           match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
615          (match(Src1, PatternMatch::m_AllOnes()) &&
616           match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
617         ExtSrc->getType()->isIntegerTy(1)) {
618       IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
619       IC.replaceOperand(II, 2,
620                         ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
621       return &II;
622     }
623 
624     CmpInst::Predicate SrcPred;
625     Value *SrcLHS;
626     Value *SrcRHS;
627 
628     // Fold compare eq/ne with 0 from a compare result as the predicate to the
629     // intrinsic. The typical use is a wave vote function in the library, which
630     // will be fed from a user code condition compared with 0. Fold in the
631     // redundant compare.
632 
633     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
634     //   -> llvm.amdgcn.[if]cmp(a, b, pred)
635     //
636     // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
637     //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
638     if (match(Src1, PatternMatch::m_Zero()) &&
639         match(Src0, PatternMatch::m_ZExtOrSExt(
640                         m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
641                               PatternMatch::m_Value(SrcRHS))))) {
642       if (CCVal == CmpInst::ICMP_EQ)
643         SrcPred = CmpInst::getInversePredicate(SrcPred);
644 
645       Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
646                                  ? Intrinsic::amdgcn_fcmp
647                                  : Intrinsic::amdgcn_icmp;
648 
649       Type *Ty = SrcLHS->getType();
650       if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
651         // Promote to next legal integer type.
652         unsigned Width = CmpType->getBitWidth();
653         unsigned NewWidth = Width;
654 
655         // Don't do anything for i1 comparisons.
656         if (Width == 1)
657           break;
658 
659         if (Width <= 16)
660           NewWidth = 16;
661         else if (Width <= 32)
662           NewWidth = 32;
663         else if (Width <= 64)
664           NewWidth = 64;
665         else if (Width > 64)
666           break; // Can't handle this.
667 
668         if (Width != NewWidth) {
669           IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
670           if (CmpInst::isSigned(SrcPred)) {
671             SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
672             SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
673           } else {
674             SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
675             SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
676           }
677         }
678       } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
679         break;
680 
681       Function *NewF = Intrinsic::getDeclaration(
682           II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
683       Value *Args[] = {SrcLHS, SrcRHS,
684                        ConstantInt::get(CC->getType(), SrcPred)};
685       CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
686       NewCall->takeName(&II);
687       return IC.replaceInstUsesWith(II, NewCall);
688     }
689 
690     break;
691   }
692   case Intrinsic::amdgcn_ballot: {
693     if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
694       if (Src->isZero()) {
695         // amdgcn.ballot(i1 0) is zero.
696         return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
697       }
698 
699       if (Src->isOne()) {
700         // amdgcn.ballot(i1 1) is exec.
701         const char *RegName = "exec";
702         if (II.getType()->isIntegerTy(32))
703           RegName = "exec_lo";
704         else if (!II.getType()->isIntegerTy(64))
705           break;
706 
707         Function *NewF = Intrinsic::getDeclaration(
708             II.getModule(), Intrinsic::read_register, II.getType());
709         Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
710         MDNode *MD = MDNode::get(II.getContext(), MDArgs);
711         Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
712         CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
713         NewCall->addAttribute(AttributeList::FunctionIndex,
714                               Attribute::Convergent);
715         NewCall->takeName(&II);
716         return IC.replaceInstUsesWith(II, NewCall);
717       }
718     }
719     break;
720   }
721   case Intrinsic::amdgcn_wqm_vote: {
722     // wqm_vote is identity when the argument is constant.
723     if (!isa<Constant>(II.getArgOperand(0)))
724       break;
725 
726     return IC.replaceInstUsesWith(II, II.getArgOperand(0));
727   }
728   case Intrinsic::amdgcn_kill: {
729     const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
730     if (!C || !C->getZExtValue())
731       break;
732 
733     // amdgcn.kill(i1 1) is a no-op
734     return IC.eraseInstFromFunction(II);
735   }
736   case Intrinsic::amdgcn_update_dpp: {
737     Value *Old = II.getArgOperand(0);
738 
739     auto *BC = cast<ConstantInt>(II.getArgOperand(5));
740     auto *RM = cast<ConstantInt>(II.getArgOperand(3));
741     auto *BM = cast<ConstantInt>(II.getArgOperand(4));
742     if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
743         BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
744       break;
745 
746     // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
747     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
748   }
749   case Intrinsic::amdgcn_permlane16:
750   case Intrinsic::amdgcn_permlanex16: {
751     // Discard vdst_in if it's not going to be read.
752     Value *VDstIn = II.getArgOperand(0);
753     if (isa<UndefValue>(VDstIn))
754       break;
755 
756     ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
757     ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
758     if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
759       break;
760 
761     return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
762   }
763   case Intrinsic::amdgcn_readfirstlane:
764   case Intrinsic::amdgcn_readlane: {
765     // A constant value is trivially uniform.
766     if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
767       return IC.replaceInstUsesWith(II, C);
768     }
769 
770     // The rest of these may not be safe if the exec may not be the same between
771     // the def and use.
772     Value *Src = II.getArgOperand(0);
773     Instruction *SrcInst = dyn_cast<Instruction>(Src);
774     if (SrcInst && SrcInst->getParent() != II.getParent())
775       break;
776 
777     // readfirstlane (readfirstlane x) -> readfirstlane x
778     // readlane (readfirstlane x), y -> readfirstlane x
779     if (match(Src,
780               PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
781       return IC.replaceInstUsesWith(II, Src);
782     }
783 
784     if (IID == Intrinsic::amdgcn_readfirstlane) {
785       // readfirstlane (readlane x, y) -> readlane x, y
786       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
787         return IC.replaceInstUsesWith(II, Src);
788       }
789     } else {
790       // readlane (readlane x, y), y -> readlane x, y
791       if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
792                          PatternMatch::m_Value(),
793                          PatternMatch::m_Specific(II.getArgOperand(1))))) {
794         return IC.replaceInstUsesWith(II, Src);
795       }
796     }
797 
798     break;
799   }
800   case Intrinsic::amdgcn_ldexp: {
801     // FIXME: This doesn't introduce new instructions and belongs in
802     // InstructionSimplify.
803     Type *Ty = II.getType();
804     Value *Op0 = II.getArgOperand(0);
805     Value *Op1 = II.getArgOperand(1);
806 
807     // Folding undef to qnan is safe regardless of the FP mode.
808     if (isa<UndefValue>(Op0)) {
809       auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
810       return IC.replaceInstUsesWith(II, QNaN);
811     }
812 
813     const APFloat *C = nullptr;
814     match(Op0, PatternMatch::m_APFloat(C));
815 
816     // FIXME: Should flush denorms depending on FP mode, but that's ignored
817     // everywhere else.
818     //
819     // These cases should be safe, even with strictfp.
820     // ldexp(0.0, x) -> 0.0
821     // ldexp(-0.0, x) -> -0.0
822     // ldexp(inf, x) -> inf
823     // ldexp(-inf, x) -> -inf
824     if (C && (C->isZero() || C->isInfinity())) {
825       return IC.replaceInstUsesWith(II, Op0);
826     }
827 
828     // With strictfp, be more careful about possibly needing to flush denormals
829     // or not, and snan behavior depends on ieee_mode.
830     if (II.isStrictFP())
831       break;
832 
833     if (C && C->isNaN()) {
834       // FIXME: We just need to make the nan quiet here, but that's unavailable
835       // on APFloat, only IEEEfloat
836       auto *Quieted =
837           ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
838       return IC.replaceInstUsesWith(II, Quieted);
839     }
840 
841     // ldexp(x, 0) -> x
842     // ldexp(x, undef) -> x
843     if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
844       return IC.replaceInstUsesWith(II, Op0);
845     }
846 
847     break;
848   }
849   case Intrinsic::amdgcn_fmul_legacy: {
850     Value *Op0 = II.getArgOperand(0);
851     Value *Op1 = II.getArgOperand(1);
852 
853     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
854     // infinity, gives +0.0.
855     // TODO: Move to InstSimplify?
856     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
857         match(Op1, PatternMatch::m_AnyZeroFP()))
858       return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
859 
860     // If we can prove we don't have one of the special cases then we can use a
861     // normal fmul instruction instead.
862     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
863       auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
864       FMul->takeName(&II);
865       return IC.replaceInstUsesWith(II, FMul);
866     }
867     break;
868   }
869   case Intrinsic::amdgcn_fma_legacy: {
870     Value *Op0 = II.getArgOperand(0);
871     Value *Op1 = II.getArgOperand(1);
872     Value *Op2 = II.getArgOperand(2);
873 
874     // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
875     // infinity, gives +0.0.
876     // TODO: Move to InstSimplify?
877     if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
878         match(Op1, PatternMatch::m_AnyZeroFP())) {
879       // It's tempting to just return Op2 here, but that would give the wrong
880       // result if Op2 was -0.0.
881       auto *Zero = ConstantFP::getNullValue(II.getType());
882       auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
883       FAdd->takeName(&II);
884       return IC.replaceInstUsesWith(II, FAdd);
885     }
886 
887     // If we can prove we don't have one of the special cases then we can use a
888     // normal fma instead.
889     if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
890       II.setCalledOperand(Intrinsic::getDeclaration(
891           II.getModule(), Intrinsic::fma, II.getType()));
892       return &II;
893     }
894     break;
895   }
896   default: {
897     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
898             AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
899       return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
900     }
901   }
902   }
903   return None;
904 }
905 
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// The result of simplifying amdgcn image and buffer store intrinsics is updating
/// the intrinsic call to request a narrower return vector (and, for buffers,
/// possibly a bumped byte offset) covering only the demanded components, then
/// re-expanding the narrow result back to the original width with an
/// insertelement or shufflevector.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
///
/// \param DemandedElts bitmask of vector result elements the caller needs.
/// \param DMaskIdx operand index of the image dmask, or -1 for the buffer case.
/// \returns a replacement value for \p II, or nullptr if no narrowing applies.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  // Scalar results cannot be narrowed further.
  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  // Any new instructions (offset add, replacement call, shuffle) are created
  // right before the original intrinsic; the guard restores the caller's
  // insert point afterwards.
  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    // ActiveBits is one past the highest demanded element; trailing zeros are
    // contiguous unused components at the front of the result.
    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      // Find which operand holds the byte offset for this intrinsic; only
      // intrinsics with a known offset operand can skip front components.
      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        // Advance the byte offset past the skipped front components.
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    // Each set dmask bit produces one result element, in bit order. Walk the
    // dmask and keep only the bits whose corresponding result element is
    // demanded.
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  // Nothing demanded at all: the whole result is undefined.
  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  // Demanded elements already form a full-width prefix, so the load cannot be
  // narrowed; still commit a narrowed dmask if the image path computed one.
  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  // Re-declare the intrinsic with a narrower (or scalar) return type.
  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  // Single surviving element: insert the scalar result back into its original
  // lane of an otherwise-undef vector.
  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  // Otherwise widen with a shuffle: demanded lanes pick consecutive elements
  // of the narrow result; undemanded lanes use index NewNumElts, which is out
  // of range for the narrow vector and thus selects from the implicit second
  // (undef/poison) shuffle operand.
  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}
1047 
1048 Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1049     InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1050     APInt &UndefElts2, APInt &UndefElts3,
1051     std::function<void(Instruction *, unsigned, APInt, APInt &)>
1052         SimplifyAndSetOp) const {
1053   switch (II.getIntrinsicID()) {
1054   case Intrinsic::amdgcn_buffer_load:
1055   case Intrinsic::amdgcn_buffer_load_format:
1056   case Intrinsic::amdgcn_raw_buffer_load:
1057   case Intrinsic::amdgcn_raw_buffer_load_format:
1058   case Intrinsic::amdgcn_raw_tbuffer_load:
1059   case Intrinsic::amdgcn_s_buffer_load:
1060   case Intrinsic::amdgcn_struct_buffer_load:
1061   case Intrinsic::amdgcn_struct_buffer_load_format:
1062   case Intrinsic::amdgcn_struct_tbuffer_load:
1063   case Intrinsic::amdgcn_tbuffer_load:
1064     return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1065   default: {
1066     if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1067       return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1068     }
1069     break;
1070   }
1071   }
1072   return None;
1073 }
1074