//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific hooks of the InstCombine pass,
// exposed through TargetTransformInfo. It uses the target's detailed
// information to fold and simplify AMDGPU intrinsic calls, while letting the
// target-independent InstCombine transformations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace
// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
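//
// For example, fmed3(1.0, 5.0, 3.0): Max3 is 5.0, which compares equal to
// Src1, so the result is maxnum(1.0, 3.0) = 3.0, the median of the inputs.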
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the constant down to a half, we do not
    // lose precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  bool FloatCoord = false;
  // If true, the derivatives can be converted to 16 bit but the coordinates
  // cannot.
  bool OnlyDerivatives = false;

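  // Within the image intrinsic's VAddr operands the derivatives come first:
  // operands [GradientStart, CoordStart) are derivatives and operands
  // [CoordStart, VAddrEnd) are coordinates.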
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (OnlyDerivatives) {
    if (!ST->hasG16())
      return None;
  } else {
    if (!ST->hasA16())
      OnlyDerivatives = true; // The target only supports G16.
  }

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
  if (!OnlyDerivatives)
    ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
  Function *I =
      Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);

  SmallVector<Value *, 8> Args(II.arg_operands());

  unsigned EndIndex =
      OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < EndIndex; OperandIndex++) {
    Args[OperandIndex] =
        convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
  }

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  NewCall->copyFastMathFlags(&II);
  return IC.replaceInstUsesWith(II, NewCall);
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions would be poison
    // values, since the shift amount would equal the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
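    //
    // For example, on i32, ubfe(x, 8, 8) becomes (x << 16) lshr 24 and
    // sbfe(x, 8, 8) becomes (x << 16) ashr 24, extracting bits [8, 16).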
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
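
    // exp has four source operands, each covered by one enable bit; exp_compr
    // packs two values per source, so each of its two sources is covered by
    // two enable bits. A source whose enable bits are all clear is never read
    // and can be replaced with undef.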
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
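    //
    // The three adjacent swaps below are enough to move up to two constant
    // operands to the rightmost positions while preserving their relative
    // order.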
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
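        // The VALU comparison only operates on 16, 32, or 64-bit values, so
        // narrower sources are extended (sign- or zero-extended to match the
        // signedness of the source predicate).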
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if exec is not the same between the
    // def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the NaN quiet here, but that operation is
      // only available on IEEEFloat, not APFloat.
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start by assuming the whole prefix of elements up to the highest
    // demanded one is demanded; if there are unused components at the front,
    // clear those bits below and advance the buffer offset instead.
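    //
    // For example, when a raw buffer load of <4 x float> only has elements 2
    // and 3 demanded, the load is shrunk to <2 x float> and the byte offset
    // is advanced by two elements (8 bytes).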
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If the resulting type is vec3, there is no point in trimming the
        // load with an updated offset, as the vec3 would most likely be
        // widened to vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

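    // For example, a dmask of 0b0111 loads three components; if only the
    // second of those is demanded, the new dmask is 0b0010.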
    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
  // fully supported.
  if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
    return nullptr;

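  // If every element is still demanded, keep the original call, but commit
  // the narrowed dmask computed above, if any.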
  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getModule();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}
1009