//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements the AMDGPU-specific InstCombine hooks exposed through
// TargetTransformInfo. It uses the target's detailed information to fold and
// simplify AMDGPU intrinsics, while letting the target-independent InstCombine
// logic handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "InstCombineTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
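//
// For example, fmed3AMDGCN(0.0, 2.0, 1.0) returns 1.0, the median of the
// three finite inputs.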
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
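// For example, the float constant 4.0 converts to half exactly and is safe,
// while a value such as 1.0e10f (outside the half range) or an i32 constant
// with more than 16 active bits is not.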
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
/// modified arguments (based on OldIntr) and replaces InstToReplace with
/// this newly created intrinsic call.
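/// Returns None if the signature of the called intrinsic cannot be
/// determined (see the Intrinsic::getIntrinsicSignature call below).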
static Optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> ArgTys;
  if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
    return None;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, ArgTys);

  Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);

  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
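  // (A sample with an explicit LOD of zero behaves like its LZ variant, e.g.
  // image.sample.l.* with lod == 0 is equivalent to image.sample.lz.*, so the
  // LOD operand can be dropped.)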
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
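      //
      // Schematically (types are illustrative):
      //   %v = call <4 x float> @llvm.amdgcn.image.sample...(...)
      //   %h = fptrunc <4 x float> %v to <4 x half>
      // becomes a single call returning <4 x half>.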
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->getScalarType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return None;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return None;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return None;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
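  //
  // For example, a constant operand such as 2.0 is finite and non-zero, so
  // fmul_legacy(x, 2.0) can be rewritten by the caller as a plain fmul.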
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }
  auto *TLI = &IC.getTargetLibraryInfo();
  if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
      isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

Optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
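      //
      // For example, llvm.amdgcn.rcp(float 2.0) constant-folds to 0.5 here.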
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    break;
  }
  case Intrinsic::amdgcn_rsq: {
    Value *Src = II.getArgOperand(0);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
    }

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    enum {
      S_NAN = 1 << 0,       // Signaling NaN
      Q_NAN = 1 << 1,       // Quiet NaN
      N_INFINITY = 1 << 2,  // Negative infinity
      N_NORMAL = 1 << 3,    // Negative normal
      N_SUBNORMAL = 1 << 4, // Negative subnormal
      N_ZERO = 1 << 5,      // Negative zero
      P_ZERO = 1 << 6,      // Positive zero
      P_SUBNORMAL = 1 << 7, // Positive subnormal
      P_NORMAL = 1 << 8,    // Positive normal
      P_INFINITY = 1 << 9   // Positive infinity
    };

    const uint32_t FullMask = S_NAN | Q_NAN | N_INFINITY | N_NORMAL |
                              N_SUBNORMAL | N_ZERO | P_ZERO | P_SUBNORMAL |
                              P_NORMAL | P_INFINITY;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (!CMask) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      if (isa<UndefValue>(Src1)) {
        return IC.replaceInstUsesWith(II,
                                      ConstantInt::get(II.getType(), false));
      }
      break;
    }

    uint32_t Mask = CMask->getZExtValue();

    // If all tests are made, it doesn't matter what the value is.
    if ((Mask & FullMask) == FullMask) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
    }

    if ((Mask & FullMask) == 0) {
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
    }

    if (Mask == (S_NAN | Q_NAN)) {
      // Equivalent of isnan. Replace with standard fcmp.
      Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    if (Mask == (N_ZERO | P_ZERO)) {
      // Equivalent of == 0.
      Value *FCmp =
          IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));

      FCmp->takeName(&II);
      return IC.replaceInstUsesWith(II, FCmp);
    }

    // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
    if (((Mask & S_NAN) || (Mask & Q_NAN)) &&
        isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
      return IC.replaceOperand(
          II, 1, ConstantInt::get(Src1->getType(), Mask & ~(S_NAN | Q_NAN)));
    }

    const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
    if (!CVal) {
      if (isa<UndefValue>(Src0)) {
        return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
      }

      // Clamp mask to used bits
      if ((Mask & FullMask) != Mask) {
        CallInst *NewCall = IC.Builder.CreateCall(
            II.getCalledFunction(),
            {Src0, ConstantInt::get(Src1->getType(), Mask & FullMask)});

        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      break;
    }

    const APFloat &Val = CVal->getValueAPF();

    bool Result =
        ((Mask & S_NAN) && Val.isNaN() && Val.isSignaling()) ||
        ((Mask & Q_NAN) && Val.isNaN() && !Val.isSignaling()) ||
        ((Mask & N_INFINITY) && Val.isInfinity() && Val.isNegative()) ||
        ((Mask & N_NORMAL) && Val.isNormal() && Val.isNegative()) ||
        ((Mask & N_SUBNORMAL) && Val.isDenormal() && Val.isNegative()) ||
        ((Mask & N_ZERO) && Val.isZero() && Val.isNegative()) ||
        ((Mask & P_ZERO) && Val.isZero() && !Val.isNegative()) ||
        ((Mask & P_SUBNORMAL) && Val.isDenormal() && !Val.isNegative()) ||
        ((Mask & P_NORMAL) && Val.isNormal() && !Val.isNegative()) ||
        ((Mask & P_INFINITY) && Val.isInfinity() && !Val.isNegative());

    return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        const fltSemantics &HalfSem =
            II.getType()->getScalarType()->getFltSemantics();
        bool LosesInfo;
        APFloat Val0 = C0->getValueAPF();
        APFloat Val1 = C1->getValueAPF();
        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);

        Constant *Folded =
            ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
                                 ConstantFP::get(II.getContext(), Val1)});
        return IC.replaceInstUsesWith(II, Folded);
      }
    }

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
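    //
    // For example, on i32, ubfe(x, 8, 8) extracts bits [15:8] and can be
    // rewritten as lshr(shl(x, 16), 24); sbfe uses ashr instead.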
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<UndefValue>(Src)) {
          IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
    // for the shader.

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    CallInst *NewCall = nullptr;
    if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
      NewCall = IC.Builder.CreateMinNum(Src1, Src2);
    } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
      NewCall = IC.Builder.CreateMinNum(Src0, Src2);
    } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
      NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
    }

    if (NewCall) {
      NewCall->copyFastMathFlags(&II);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(
              II, ConstantFP::get(IC.Builder.getContext(), Result));
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
        if (CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, ConstantExpr::getSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpInst::Predicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else if (Width > 64)
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Function *NewF = Intrinsic::getDeclaration(
          II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
  case Intrinsic::amdgcn_ballot: {
    if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
      if (Src->isZero()) {
        // amdgcn.ballot(i1 0) is zero.
        return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
      }

      if (Src->isOne()) {
        // amdgcn.ballot(i1 1) is exec.
        const char *RegName = "exec";
        if (II.getType()->isIntegerTy(32))
          RegName = "exec_lo";
        else if (!II.getType()->isIntegerTy(64))
          break;

        Function *NewF = Intrinsic::getDeclaration(
            II.getModule(), Intrinsic::read_register, II.getType());
        Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }
    }
    break;
  }
  case Intrinsic::amdgcn_wqm_vote: {
    // wqm_vote is identity when the argument is constant.
    if (!isa<Constant>(II.getArgOperand(0)))
      break;

    return IC.replaceInstUsesWith(II, II.getArgOperand(0));
  }
  case Intrinsic::amdgcn_kill: {
    const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!C || !C->getZExtValue())
      break;

    // amdgcn.kill(i1 1) is a no-op
    return IC.eraseInstFromFunction(II);
  }
  case Intrinsic::amdgcn_update_dpp: {
    Value *Old = II.getArgOperand(0);

    auto *BC = cast<ConstantInt>(II.getArgOperand(5));
    auto *RM = cast<ConstantInt>(II.getArgOperand(3));
    auto *BM = cast<ConstantInt>(II.getArgOperand(4));
    if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
        BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
      break;

    // If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
    return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
  }
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16: {
    // Discard vdst_in if it's not going to be read.
    Value *VDstIn = II.getArgOperand(0);
    if (isa<UndefValue>(VDstIn))
      break;

    ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
    ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
    if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
      break;

    return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
  }
  case Intrinsic::amdgcn_permlane64:
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }
    break;
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_readlane: {
    // A constant value is trivially uniform.
    if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
      return IC.replaceInstUsesWith(II, C);
    }

    // The rest of these may not be safe if the exec may not be the same between
    // the def and use.
    Value *Src = II.getArgOperand(0);
    Instruction *SrcInst = dyn_cast<Instruction>(Src);
    if (SrcInst && SrcInst->getParent() != II.getParent())
      break;

    // readfirstlane (readfirstlane x) -> readfirstlane x
    // readlane (readfirstlane x), y -> readfirstlane x
    if (match(Src,
              PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
      return IC.replaceInstUsesWith(II, Src);
    }

    if (IID == Intrinsic::amdgcn_readfirstlane) {
      // readfirstlane (readlane x, y) -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
        return IC.replaceInstUsesWith(II, Src);
      }
    } else {
      // readlane (readlane x, y), y -> readlane x, y
      if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
                         PatternMatch::m_Value(),
                         PatternMatch::m_Specific(II.getArgOperand(1))))) {
        return IC.replaceInstUsesWith(II, Src);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_ldexp: {
    // FIXME: This doesn't introduce new instructions and belongs in
    // InstructionSimplify.
    Type *Ty = II.getType();
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // Folding undef to qnan is safe regardless of the FP mode.
    if (isa<UndefValue>(Op0)) {
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    const APFloat *C = nullptr;
    match(Op0, PatternMatch::m_APFloat(C));

    // FIXME: Should flush denorms depending on FP mode, but that's ignored
    // everywhere else.
    //
    // These cases should be safe, even with strictfp.
    // ldexp(0.0, x) -> 0.0
    // ldexp(-0.0, x) -> -0.0
    // ldexp(inf, x) -> inf
    // ldexp(-inf, x) -> -inf
    if (C && (C->isZero() || C->isInfinity())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    // With strictfp, be more careful about possibly needing to flush denormals
    // or not, and snan behavior depends on ieee_mode.
    if (II.isStrictFP())
      break;

    if (C && C->isNaN()) {
      // FIXME: We just need to make the nan quiet here, but that's unavailable
      // on APFloat, only IEEEfloat
      auto *Quieted =
          ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
      return IC.replaceInstUsesWith(II, Quieted);
    }

    // ldexp(x, 0) -> x
    // ldexp(x, undef) -> x
    if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
      return IC.replaceInstUsesWith(II, Op0);
    }

    break;
  }
  case Intrinsic::amdgcn_fmul_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP()))
      return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));

    // If we can prove we don't have one of the special cases then we can use a
    // normal fmul instruction instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
      FMul->takeName(&II);
      return IC.replaceInstUsesWith(II, FMul);
    }
    break;
  }
  case Intrinsic::amdgcn_fma_legacy: {
    Value *Op0 = II.getArgOperand(0);
    Value *Op1 = II.getArgOperand(1);
    Value *Op2 = II.getArgOperand(2);

    // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
    // infinity, gives +0.0.
    // TODO: Move to InstSimplify?
    if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
        match(Op1, PatternMatch::m_AnyZeroFP())) {
      // It's tempting to just return Op2 here, but that would give the wrong
      // result if Op2 was -0.0.
      auto *Zero = ConstantFP::getNullValue(II.getType());
      auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
      FAdd->takeName(&II);
      return IC.replaceInstUsesWith(II, FAdd);
    }

    // If we can prove we don't have one of the special cases then we can use a
    // normal fma instead.
    if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
      II.setCalledOperand(Intrinsic::getDeclaration(
          II.getModule(), Intrinsic::fma, II.getType()));
      return &II;
    }
    break;
  }
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    if (isa<UndefValue>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));

    if (isa<ConstantPointerNull>(II.getArgOperand(0)))
      return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
    break;
  }
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
      return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
    }
  }
  }
  return None;
}

/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
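///
/// For example, if only the first two elements of a <4 x float> buffer load
/// are actually used, the call is rewritten to return <2 x float> and the
/// result is shuffled back to the original <4 x float> type.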
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1) {

  auto *IIVTy = cast<FixedVectorType>(II.getType());
  unsigned VWidth = IIVTy->getNumElements();
  if (VWidth == 1)
    return nullptr;

  IRBuilderBase::InsertPointGuard Guard(IC.Builder);
  IC.Builder.SetInsertPoint(&II);

  // Assume the arguments are unchanged and later override them, if needed.
  SmallVector<Value *, 16> Args(II.args());

  if (DMaskIdx < 0) {
    // Buffer case.

    const unsigned ActiveBits = DemandedElts.getActiveBits();
    const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();

    // Start assuming the prefix of elements is demanded, but possibly clear
    // some other bits if there are trailing zeros (unused components at front)
    // and update offset.
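    //
    // For example, if only element 2 of a <4 x float> raw.buffer.load is
    // demanded, the offset operand is advanced by 2 * 4 bytes and a narrower
    // load starting at that component is emitted instead.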
    DemandedElts = (1 << ActiveBits) - 1;

    if (UnusedComponentsAtFront > 0) {
      static const unsigned InvalidOffsetIdx = 0xf;

      unsigned OffsetIdx;
      switch (II.getIntrinsicID()) {
      case Intrinsic::amdgcn_raw_buffer_load:
        OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_s_buffer_load:
        // If resulting type is vec3, there is no point in trimming the
        // load with updated offset, as the vec3 would most likely be widened to
        // vec4 anyway during lowering.
        if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
          OffsetIdx = InvalidOffsetIdx;
        else
          OffsetIdx = 1;
        break;
      case Intrinsic::amdgcn_struct_buffer_load:
        OffsetIdx = 2;
        break;
      default:
        // TODO: handle tbuffer* intrinsics.
        OffsetIdx = InvalidOffsetIdx;
        break;
      }

      if (OffsetIdx != InvalidOffsetIdx) {
        // Clear demanded bits and update the offset.
        DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
        auto *Offset = II.getArgOperand(OffsetIdx);
        unsigned SingleComponentSizeInBits =
            IC.getDataLayout().getTypeSizeInBits(II.getType()->getScalarType());
        unsigned OffsetAdd =
            UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
        auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
        Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
      }
    }
  } else {
    // Image case.

    ConstantInt *DMask = cast<ConstantInt>(II.getArgOperand(DMaskIdx));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;

    // Mask off values that are undefined because the dmask doesn't cover them
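    // (e.g. a dmask of 0b0101 yields only two result components, so demanded
    // elements beyond the first two cannot correspond to loaded data)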
    DemandedElts &= (1 << countPopulation(DMaskVal)) - 1;

    unsigned NewDMaskVal = 0;
    unsigned OrigLoadIdx = 0;
    for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
      const unsigned Bit = 1 << SrcIdx;
      if (!!(DMaskVal & Bit)) {
        if (!!DemandedElts[OrigLoadIdx])
          NewDMaskVal |= Bit;
        OrigLoadIdx++;
      }
    }

    if (DMaskVal != NewDMaskVal)
      Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
  }

  unsigned NewNumElts = DemandedElts.countPopulation();
  if (!NewNumElts)
    return UndefValue::get(II.getType());

  if (NewNumElts >= VWidth && DemandedElts.isMask()) {
    if (DMaskIdx >= 0)
      II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
    return nullptr;
  }

  // Validate function argument and return types, extracting overloaded types
  // along the way.
  SmallVector<Type *, 6> OverloadTys;
  if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
    return nullptr;

  Module *M = II.getParent()->getParent()->getParent();
  Type *EltTy = IIVTy->getElementType();
  Type *NewTy =
      (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);

  OverloadTys[0] = NewTy;
  Function *NewIntrin =
      Intrinsic::getDeclaration(M, II.getIntrinsicID(), OverloadTys);

  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
  NewCall->takeName(&II);
  NewCall->copyMetadata(II);

  if (NewNumElts == 1) {
    return IC.Builder.CreateInsertElement(UndefValue::get(II.getType()),
                                          NewCall,
                                          DemandedElts.countTrailingZeros());
  }

  SmallVector<int, 8> EltMask;
  unsigned NewLoadIdx = 0;
  for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
    if (!!DemandedElts[OrigLoadIdx])
      EltMask.push_back(NewLoadIdx++);
    else
      EltMask.push_back(NewNumElts);
  }

  Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);

  return Shuffle;
}

Optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
    InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
    APInt &UndefElts2, APInt &UndefElts3,
    std::function<void(Instruction *, unsigned, APInt, APInt &)>
        SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_tbuffer_load:
    return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
  default: {
    if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
      return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
    }
    break;
  }
  }
  return None;
}