//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific parts of InstCombine: it folds and
// simplifies amdgcn intrinsic calls using the target's detailed information,
// and is invoked through the TargetTransformInfo hooks instCombineIntrinsic
// and simplifyDemandedVectorEltsIntrinsic, while the target-independent
// InstCombine handles everything else.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
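//
// For example, fmed3(1.0, 4.0, 2.0): Max3 is 4.0, which matches Src1, so the
// result is maxnum(1.0, 2.0) = 2.0, the median of the three inputs.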
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
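// For example, (fpext half %h to float) or a float constant such as 2.0
// qualifies, while an arbitrary float value or a constant like 1.0e10 does not.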
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the constant down to a half, we do not
    // lose precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
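// For example, (fpext half %h to float) is replaced by %h directly, an i32 is
// truncated to i16, and a float is narrowed to half.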
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

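// Try to rewrite an image intrinsic to use 16-bit coordinates (A16) and/or
// 16-bit derivatives (G16) when every affected operand is provably derived
// from a 16-bit value. For example, if all coordinates are (fpext half ...)
// values, the overloaded coordinate type is rewritten to half and the fpexts
// are stripped by convertTo16Bit.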
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // If true, only the derivatives (gradients) can be converted to 16-bit,
  // not the coordinates.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // Only supports G16
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

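    // Constant fold the reciprocal, e.g. llvm.amdgcn.rcp(4.0) -> 0.25.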
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
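    // Constant fold, e.g. frexp_mant(8.0) -> 0.5 and frexp_exp(8.0) -> 4,
    // since 8.0 == 0.5 * 2^4.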
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
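    // Constant fold, e.g. cvt_pkrtz(1.5, 2.5) -> <2 x half> <1.5, 2.5>, with
    // each input truncated toward zero when converting to half.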
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe.  If Width == 0, then the ashr and lshr instructions would be poison
    // values since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
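    // For example, with i32 operands, ubfe(x, 8, 8) becomes
    // lshr(shl(x, 16), 24), i.e. it extracts bits [15:8] of x.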
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

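    // Operands that the enable mask does not cover are never read by the
    // hardware, so they can be replaced with undef; e.g. en == 0x3 for
    // llvm.amdgcn.exp only reads src0 and src1.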
    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }
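    // Three swaps are enough to move two constants into the trailing operands,
    // e.g. fmed3(c0, c1, x) -> fmed3(c0, x, c1) -> fmed3(x, c0, c1).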

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

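        // Widen to the legal width, e.g. an i8 compare becomes an i16 compare,
        // sign-extending the operands for signed predicates and zero-extending
        // them otherwise.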
        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and the row mask and bank mask are both 0xf, the old
    // value is never used, so we can omit it.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if EXEC is not the same between the
    // def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start by assuming the whole prefix of elements is demanded, but possibly
    // clear some of the low bits if there are unused components at the front
    // (trailing zeros in DemandedElts) and bump the offset instead.
    DemandedElts = (1 << ActiveBits) - 1;

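    // For example, for a raw.buffer.load of <4 x i32> where only elements 2
    // and 3 are used, the demanded mask becomes 0b1100, the buffer offset is
    // advanced by 8 bytes, and the load is later shrunk to <2 x i32>.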
    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If the resulting type is vec3, there is no point in trimming the
        // load with an updated offset, as the vec3 would most likely be
        // widened back to vec4 during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

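    // Recompute the dmask from the demanded elements. For example, with
    // dmask == 0b1011 (three components loaded) and only the first returned
    // element used, the new dmask is 0b0001.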
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}