//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU specific parts of InstCombine: target-aware
// folding and simplification of AMDGCN intrinsics, exposed through the
// target's TargetTransformInfo hooks.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
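//
// The median of three values is the maximum of the two inputs that are not
// the overall maximum, so find the three-way maximum and then take maxnum of
// the remaining pair.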
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the index down to a half, we do not lose
    // precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit, either by stripping the extension it came from
// or by emitting a narrowing cast.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

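// Rewrite an image intrinsic so that coordinate and/or derivative (gradient)
// operands that provably fit in 16 bits use the 16-bit A16/G16 operand forms.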
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // True means only the derivatives can be converted to 16-bit, not the
  // coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

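  // Decide which operands to rewrite based on the available subtarget
  // features: G16 only covers the gradients, while A16 also covers the
  // coordinates.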
  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

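  // Copy the original operands and convert the affected address operands down
  // to 16 bits before emitting the rewritten call.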
  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions would become
    // poison values, since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
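    //
    // For example, with i32 operands ubfe(x, 8, 5) becomes lshr(shl(x, 19), 27)
    // and sbfe(x, 8, 5) becomes ashr(shl(x, 19), 27).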
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

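    // For exp each enable bit covers one source operand; for exp_compr each
    // pair of enable bits covers one packed source. Sources whose channels are
    // all disabled can be replaced with undef.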
    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

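    // Src1 may have received a constant from the previous swap, so re-check
    // the first pair to push constants all the way to the right.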
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
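        // A compare of two constants that evaluates to false folds to zero in
        // every lane.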
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if exec could differ between the
    // def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We only need to quiet the NaN here, but that operation is only
      // available on IEEEFloat, not on APFloat.
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start by assuming the whole prefix of elements is demanded; if there are
    // unused components at the front (trailing zeros in the mask), clear those
    // bits and bump the buffer offset instead where possible.
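    //
    // For example, a <4 x float> load of which only elements 2 and 3 are used
    // becomes a two-element load with the byte offset advanced by two
    // components, when the offset operand can be updated.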
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If the resulting type is vec3, there is no point in trimming the
        // load with an updated offset, as the vec3 would most likely be
        // widened to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

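    // Rebuild the dmask so that it keeps only the bits whose corresponding
    // loaded component is still demanded, e.g. dmask 0b1011 with only the
    // first returned element used becomes 0b0001.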
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

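  // Widen the narrowed result back to the original vector type: demanded lanes
  // come from the new call and all other lanes read the undef shuffle operand.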
  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}