//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
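//
// The overall maximum of the three operands is computed first; whichever
// operand compares equal to it is excluded, and the median is the maximum of
// the remaining two.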
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  default:
    break;
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions would produce
    // poison since the shift amount would equal the bit width.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
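    // Shift the field to the top of the register and then back down so the
    // bits above it are discarded; an arithmetic shift gives sbfe's sign
    // extension, a logical shift gives ubfe's zero extension. If the field
    // already reaches the top bit, a single right shift is enough.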
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
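    // A regular export has one enable bit per source; a compressed export
    // carries two packed values per source, so a source is only unused when
    // both of its enable bits are clear. Unused sources are replaced with
    // undef.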
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addAttribute(AttributeList::FunctionIndex,
                              Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old
    // value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The remaining folds may not be safe if the exec mask differs between the
    // def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We only need to quiet the NaN here, but that operation is only
      // available on IEEEFloat, not on APFloat.
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.arg_begin(), II.arg_end());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start by assuming the whole prefix of elements is demanded; then clear
    // the bits for any unused components at the front and fold them into an
    // updated offset instead.
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
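        // Advance the byte offset past the skipped components so the narrowed
        // load still begins at the first demanded element.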
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
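    // Walk the original dmask: each set bit corresponds to one returned
    // element, and the bit is kept only if that element is still demanded.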
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  // FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
  // fully supported.
  if (II.getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
    return nullptr;

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
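  // Build a mask that scatters the narrowed results back to their original
  // positions; lanes that are no longer demanded select from the undef operand
  // (index >= NewNumElts) and become undef.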
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle =
      IC.Builder.CreateShuffleVector(NewCall, UndefValue::get(NewTy), EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}