//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific InstCombine hooks reached through
// TargetTransformInfo. It uses the target's detailed information to fold and
// simplify calls to amdgcn intrinsics, while letting the target-independent
// InstCombine transformations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
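//
// For example, with all-constant, NaN-free inputs, fmed3(0.0, 4.0, 2.0)
// computes Max3 = 4.0 == Src1 and so returns maxnum(0.0, 2.0) = 2.0, the
// median of the three inputs.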
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
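//
// For example, (fpext half %x to float) or a ConstantFP that is exactly
// representable in half (such as 2.0) qualifies; a float like 1.0e10, or a
// value that is already 16 bits wide, does not.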
static bool canSafelyConvertTo16Bit(Value &V) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
    // We need to check that if we cast the value down to a half, we do not
    // lose precision.
    APFloat FloatValue(ConstFloat->getValueAPF());
    bool LosesInfo = true;
    FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
    return !LosesInfo;
  }
  Value *CastSrc;
  if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
      match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies \p Func to the call's argument and overloaded-type lists, then
/// replaces \p II with an equivalent call to \p NewIntr built from the
/// modified lists.
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(II.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&II);

  // Erase and replace uses
  if (!II.getType()->isVoidTy())
    IC.replaceInstUsesWith(II, NewCall);
  return IC.eraseInstFromFunction(II);
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
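  //
  // E.g. (sketch; type-overload suffixes elided):
  //   llvm.amdgcn.image.sample.l.2d(..., float 0.0 /*lod*/, ...)
  //     --> llvm.amdgcn.image.sample.lz.2d(...)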
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away when 'lod' is zero
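  //
  // E.g. (sketch; type-overload suffixes elided):
  //   llvm.amdgcn.image.load.mip.2d(..., i32 0 /*mip*/, ...)
  //     --> llvm.amdgcn.image.load.2d(...)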
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
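  //
  // E.g. (sketch; type-overload suffixes elided; the bias overload type is
  // dropped from the signature as well):
  //   llvm.amdgcn.image.sample.b.2d(..., float 0.0 /*bias*/, ...)
  //     --> llvm.amdgcn.image.sample.2d(...)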
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
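  //
  // E.g. (sketch; type-overload suffixes elided):
  //   llvm.amdgcn.image.sample.o.2d(..., i32 0 /*offset*/, ...)
  //     --> llvm.amdgcn.image.sample.2d(...)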
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;
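
  // E.g. a 2D sample whose address operands are all (fpext half %x to float)
  // can be rewritten to take half operands directly (A16); when only the
  // gradients qualify, just those operands are narrowed (G16).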

  bool FloatCoord = false;
  // True means the gradients can be converted to 16 bit but the coordinates
  // cannot.
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    if (!canSafelyConvertTo16Bit(*Bias))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions below would be
    // poison, since the shift amount would equal the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
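    //
    // E.g. for i32 ubfe(%x, 8, 8): Offset + Width = 16 < 32, so this emits
    // (%x << 16) >> 24 with lshr, leaving the extracted byte in the low bits.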
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
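        //
        // E.g. (sketch, wave64): amdgcn.icmp(i64 0, i64 0, eq) becomes
        //   call i64 @llvm.read_register.i64(metadata !"exec")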
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1 and row mask = bank mask = 0xf, the old value is
    // never read, so we can omit it.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the EXEC mask is not the same
    // between the def and the use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
///       struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
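    //
    // E.g. for a <4 x float> raw.buffer.load with DemandedElts = 0b0110:
    // ActiveBits = 3 and UnusedComponentsAtFront = 1, so the load is narrowed
    // to <2 x float> and one component (4 bytes) is added to the offset.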
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;
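    // E.g. dmask = 0b0101 loads two components; if only the second loaded
    // element is demanded (DemandedElts = 0b10), the loop below computes a
    // new dmask of 0b0100.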

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

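  // E.g. with VWidth = 4 and DemandedElts = 0b0101, the narrowed call returns
  // two elements and the loop below builds EltMask = {0, 2, 1, 2}; index
  // 2 == NewNumElts is past the end of the new result, so it selects an
  // undef lane in the shuffle.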
  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}