1 //===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass ---===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements the AMDGPU-specific parts of the InstCombine pass:
11 // it uses the target's detailed information to combine and simplify calls
12 // to AMDGPU intrinsics, while letting the target-independent InstCombine
13 // folds handle everything else.
14 //
15 //===----------------------------------------------------------------------===//
16
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUTargetTransformInfo.h"
19 #include "GCNSubtarget.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/Transforms/InstCombine/InstCombiner.h"
23 #include <optional>
24
25 using namespace llvm;
26 using namespace llvm::PatternMatch;
27
28 #define DEBUG_TYPE "AMDGPUtti"
29
30 namespace {
31
32 struct AMDGPUImageDMaskIntrinsic {
33 unsigned Intr;
34 };
35
36 #define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
37 #include "InstCombineTables.inc"
38
39 } // end anonymous namespace
40
41 // Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
42 //
43 // A single NaN input is folded to minnum, so we rely on that folding for
44 // handling NaNs.
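//
// The median of three values is the maximum of the two inputs that are not
// the overall maximum. For example, fmed3(1.0, 4.0, 2.0): Max3 is 4.0 (Src1),
// so the result is maxnum(Src0, Src2) = maxnum(1.0, 2.0) = 2.0.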
45 static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
46 const APFloat &Src2) {
47 APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);
48
49 APFloat::cmpResult Cmp0 = Max3.compare(Src0);
50 assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
51 if (Cmp0 == APFloat::cmpEqual)
52 return maxnum(Src1, Src2);
53
54 APFloat::cmpResult Cmp1 = Max3.compare(Src1);
55 assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
56 if (Cmp1 == APFloat::cmpEqual)
57 return maxnum(Src0, Src2);
58
59 return maxnum(Src0, Src1);
60 }
61
62 // Check if a value can be converted to a 16-bit value without losing
63 // precision.
64 // The value is expected to be either a float (IsFloat = true) or an unsigned
65 // integer (IsFloat = false).
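// For example, a ConstantFP such as 4.0 (exactly representable as a half), a
// ConstantInt with at most 16 active bits, or a value produced by fpext/zext
// from a 16-bit type all qualify. A value that is already 16 bits does not,
// since there is nothing left to narrow.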
66 static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
67 Type *VTy = V.getType();
68 if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
69 // The value is already 16-bit, so we don't want to convert to 16-bit again!
70 return false;
71 }
72 if (IsFloat) {
73 if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
74 // We need to check that if we cast the index down to a half, we do not
75 // lose precision.
76 APFloat FloatValue(ConstFloat->getValueAPF());
77 bool LosesInfo = true;
78 FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
79 &LosesInfo);
80 return !LosesInfo;
81 }
82 } else {
83 if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
84 // We need to check that if we cast the index down to an i16, we do not
85 // lose precision.
86 APInt IntValue(ConstInt->getValue());
87 return IntValue.getActiveBits() <= 16;
88 }
89 }
90
91 Value *CastSrc;
92 bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
93 : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
94 if (IsExt) {
95 Type *CastSrcTy = CastSrc->getType();
96 if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
97 return true;
98 }
99
100 return false;
101 }
102
103 // Convert a value to 16-bit.
104 static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
105 Type *VTy = V.getType();
106 if (isa<FPExtInst>(&V) || isa<SExtInst>(&V) || isa<ZExtInst>(&V))
107 return cast<Instruction>(&V)->getOperand(0);
108 if (VTy->isIntegerTy())
109 return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
110 if (VTy->isFloatingPointTy())
111 return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));
112
113 llvm_unreachable("Should never be called!");
114 }
115
116 /// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates intrinsic call with
117 /// modified arguments (based on OldIntr) and replaces InstToReplace with
118 /// this newly created intrinsic call.
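///
/// A typical caller passes the intrinsic itself as InstToReplace together with
/// a lambda that drops or retypes arguments (see the image-intrinsic folds
/// below). The D16 fold instead passes the fptrunc user as InstToReplace, so
/// both the user and the original intrinsic end up replaced and erased.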
119 static std::optional<Instruction *> modifyIntrinsicCall(
120 IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
121 InstCombiner &IC,
122 std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
123 Func) {
124 SmallVector<Type *, 4> ArgTys;
125 if (!Intrinsic::getIntrinsicSignature(OldIntr.getCalledFunction(), ArgTys))
126 return std::nullopt;
127
128 SmallVector<Value *, 8> Args(OldIntr.args());
129
130 // Modify arguments and types
131 Func(Args, ArgTys);
132
133 Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
134
135 CallInst *NewCall = IC.Builder.CreateCall(I, Args);
136 NewCall->takeName(&OldIntr);
137 NewCall->copyMetadata(OldIntr);
138 if (isa<FPMathOperator>(NewCall))
139 NewCall->copyFastMathFlags(&OldIntr);
140
141 // Erase and replace uses
142 if (!InstToReplace.getType()->isVoidTy())
143 IC.replaceInstUsesWith(InstToReplace, NewCall);
144
145 bool RemoveOldIntr = &OldIntr != &InstToReplace;
146
147 auto RetValue = IC.eraseInstFromFunction(InstToReplace);
148 if (RemoveOldIntr)
149 IC.eraseInstFromFunction(OldIntr);
150
151 return RetValue;
152 }
153
154 static std::optional<Instruction *>
155 simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
156 const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
157 IntrinsicInst &II, InstCombiner &IC) {
158 // Optimize _L to _LZ when _L is zero
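// For example, an image.sample.l with a constant zero lod (or a negative lod,
// which is treated the same way here) becomes the corresponding
// image.sample.lz with the lod argument erased.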
159 if (const auto *LZMappingInfo =
160 AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
161 if (auto *ConstantLod =
162 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
163 if (ConstantLod->isZero() || ConstantLod->isNegative()) {
164 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
165 AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
166 ImageDimIntr->Dim);
167 return modifyIntrinsicCall(
168 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
169 Args.erase(Args.begin() + ImageDimIntr->LodIndex);
170 });
171 }
172 }
173 }
174
175 // Optimize _mip away, when 'lod' is zero
176 if (const auto *MIPMappingInfo =
177 AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
178 if (auto *ConstantMip =
179 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
180 if (ConstantMip->isZero()) {
181 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
182 AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
183 ImageDimIntr->Dim);
184 return modifyIntrinsicCall(
185 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
186 Args.erase(Args.begin() + ImageDimIntr->MipIndex);
187 });
188 }
189 }
190 }
191
192 // Optimize _bias away when 'bias' is zero
193 if (const auto *BiasMappingInfo =
194 AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
195 if (auto *ConstantBias =
196 dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
197 if (ConstantBias->isZero()) {
198 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
199 AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
200 ImageDimIntr->Dim);
201 return modifyIntrinsicCall(
202 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
203 Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
204 ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
205 });
206 }
207 }
208 }
209
210 // Optimize _offset away when 'offset' is zero
211 if (const auto *OffsetMappingInfo =
212 AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
213 if (auto *ConstantOffset =
214 dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
215 if (ConstantOffset->isZero()) {
216 const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
217 AMDGPU::getImageDimIntrinsicByBaseOpcode(
218 OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
219 return modifyIntrinsicCall(
220 II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
221 Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
222 });
223 }
224 }
225 }
226
227 // Try to use D16
228 if (ST->hasD16Images()) {
229
230 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
231 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
232
233 if (BaseOpcode->HasD16) {
234
235 // If the only use of the image intrinsic is an fptrunc to half, replace
236 // both the fptrunc and the image intrinsic with an equivalent image
237 // intrinsic that returns D16 (half) data directly.
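// A rough sketch of the fold:
//   %v = call <4 x float> @llvm.amdgcn.image.sample.2d...(...)
//   %h = fptrunc <4 x float> %v to <4 x half>
// becomes a single image.sample call returning <4 x half> directly.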
238 if (II.hasOneUse()) {
239 Instruction *User = II.user_back();
240
241 if (User->getOpcode() == Instruction::FPTrunc &&
242 User->getType()->getScalarType()->isHalfTy()) {
243
244 return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
245 [&](auto &Args, auto &ArgTys) {
246 // Change return type of image intrinsic.
247 // Set it to return type of fptrunc.
248 ArgTys[0] = User->getType();
249 });
250 }
251 }
252 }
253 }
254
255 // Try to use A16 or G16
256 if (!ST->hasA16() && !ST->hasG16())
257 return std::nullopt;
258
259 // Address is interpreted as float if the instruction has a sampler or as
260 // unsigned int if there is no sampler.
261 bool HasSampler =
262 AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
263 bool FloatCoord = false;
264 // true means derivatives can be converted to 16 bit, coordinates not
265 bool OnlyDerivatives = false;
266
267 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
268 OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
269 Value *Coord = II.getOperand(OperandIndex);
270 // If the values are not derived from 16-bit values, we cannot optimize.
271 if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
272 if (OperandIndex < ImageDimIntr->CoordStart ||
273 ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
274 return std::nullopt;
275 }
276 // All gradients can be converted, so convert only them
277 OnlyDerivatives = true;
278 break;
279 }
280
281 assert(OperandIndex == ImageDimIntr->GradientStart ||
282 FloatCoord == Coord->getType()->isFloatingPointTy());
283 FloatCoord = Coord->getType()->isFloatingPointTy();
284 }
285
286 if (!OnlyDerivatives && !ST->hasA16())
287 OnlyDerivatives = true; // Only supports G16
288
289 // Check if there is a bias parameter and if it can be converted to f16
290 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
291 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
292 assert(HasSampler &&
293 "Only image instructions with a sampler can have a bias");
294 if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
295 OnlyDerivatives = true;
296 }
297
298 if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
299 ImageDimIntr->CoordStart))
300 return std::nullopt;
301
302 Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
303 : Type::getInt16Ty(II.getContext());
304
305 return modifyIntrinsicCall(
306 II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
307 ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
308 if (!OnlyDerivatives) {
309 ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
310
311 // Change the bias type
312 if (ImageDimIntr->NumBiasArgs != 0)
313 ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
314 }
315
316 unsigned EndIndex =
317 OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
318 for (unsigned OperandIndex = ImageDimIntr->GradientStart;
319 OperandIndex < EndIndex; OperandIndex++) {
320 Args[OperandIndex] =
321 convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
322 }
323
324 // Convert the bias
325 if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
326 Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
327 Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
328 }
329 });
330 }
331
332 bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
333 const Value *Op0, const Value *Op1,
334 InstCombiner &IC) const {
335 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
336 // infinity, gives +0.0. If we can prove we don't have one of the special
337 // cases then we can use a normal multiply instead.
338 // TODO: Create and use isKnownFiniteNonZero instead of just matching
339 // constants here.
340 if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
341 match(Op1, PatternMatch::m_FiniteNonZero())) {
342 // One operand is not zero or infinity or NaN.
343 return true;
344 }
345
346 auto *TLI = &IC.getTargetLibraryInfo();
347 if (isKnownNeverInfOrNaN(Op0, IC.getDataLayout(), TLI, 0,
348 &IC.getAssumptionCache(), &I,
349 &IC.getDominatorTree()) &&
350 isKnownNeverInfOrNaN(Op1, IC.getDataLayout(), TLI, 0,
351 &IC.getAssumptionCache(), &I,
352 &IC.getDominatorTree())) {
353 // Neither operand is infinity or NaN.
354 return true;
355 }
356 return false;
357 }
358
359 /// Match an fpext from half to float, or a constant we can convert.
360 static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
361 if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
362 return FPExtSrc->getType()->isHalfTy();
363
364 ConstantFP *CFP;
365 if (match(Arg, m_ConstantFP(CFP))) {
366 bool LosesInfo;
367 APFloat Val(CFP->getValueAPF());
368 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
369 if (LosesInfo)
370 return false;
371
372 FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
373 return true;
374 }
375
376 return false;
377 }
378
379 // Trim all zero components from the end of the vector \p UseV and return
380 // the corresponding mask of demanded elements.
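// For example, storing <4 x float> <float %x, float %y, float 0.0, float 0.0>
// yields a demanded mask of 0b0011, so only the first two components are kept.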
381 static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
382 Instruction *I) {
383 auto *VTy = cast<FixedVectorType>(UseV->getType());
384 unsigned VWidth = VTy->getNumElements();
385 APInt DemandedElts = APInt::getAllOnes(VWidth);
386
387 for (int i = VWidth - 1; i > 0; --i) {
388 auto *Elt = findScalarElement(UseV, i);
389 if (!Elt)
390 break;
391
392 if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
393 if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
394 break;
395 } else {
396 break;
397 }
398
399 DemandedElts.clearBit(i);
400 }
401
402 return DemandedElts;
403 }
404
405 // Trim elements from the end of the vector \p V if they are equal to the
406 // first element of the vector.
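// For example, a splat store <4 x float> <%x, %x, %x, %x> yields a demanded
// mask of 0b0001, presumably because targets with default component broadcast
// fill the omitted components from the first stored component.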
407 static APInt defaultComponentBroadcast(Value *V) {
408 auto *VTy = cast<FixedVectorType>(V->getType());
409 unsigned VWidth = VTy->getNumElements();
410 APInt DemandedElts = APInt::getAllOnes(VWidth);
411 Value *FirstComponent = findScalarElement(V, 0);
412
413 SmallVector<int> ShuffleMask;
414 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
415 SVI->getShuffleMask(ShuffleMask);
416
417 for (int I = VWidth - 1; I > 0; --I) {
418 if (ShuffleMask.empty()) {
419 auto *Elt = findScalarElement(V, I);
420 if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
421 break;
422 } else {
423 // Detect identical elements in the shufflevector result, even though
424 // findScalarElement cannot tell us what that element is.
425 if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
426 break;
427 }
428 DemandedElts.clearBit(I);
429 }
430
431 return DemandedElts;
432 }
433
434 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
435 IntrinsicInst &II,
436 APInt DemandedElts,
437 int DMaskIdx = -1,
438 bool IsLoad = true);
439
440 /// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
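/// For f32 this presumably requires relaxed precision (approx-func or an
/// !fpmath tolerance of at least 1.0 ulp) because the fused rsq does not meet
/// the default accuracy requirements; the f16 form is always allowed.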
441 static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
442 return (SqrtOp->getType()->isFloatTy() &&
443 (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
444 SqrtOp->getType()->isHalfTy();
445 }
446
447 std::optional<Instruction *>
448 GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
449 Intrinsic::ID IID = II.getIntrinsicID();
450 switch (IID) {
451 case Intrinsic::amdgcn_rcp: {
452 Value *Src = II.getArgOperand(0);
453
454 // TODO: Move to ConstantFolding/InstSimplify?
455 if (isa<UndefValue>(Src)) {
456 Type *Ty = II.getType();
457 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
458 return IC.replaceInstUsesWith(II, QNaN);
459 }
460
461 if (II.isStrictFP())
462 break;
463
464 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
465 const APFloat &ArgVal = C->getValueAPF();
466 APFloat Val(ArgVal.getSemantics(), 1);
467 Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
468
469 // This is more precise than the instruction may give.
470 //
471 // TODO: The instruction always flushes denormal results (except for f16),
472 // should this also?
473 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
474 }
475
476 FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
477 if (!FMF.allowContract())
478 break;
479 auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
480 if (!SrcCI)
481 break;
482
483 auto IID = SrcCI->getIntrinsicID();
484 // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
485 //
486 // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
487 // relaxed.
488 if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
489 const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
490 FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
491 if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
492 break;
493
494 if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
495 break;
496
497 Function *NewDecl = Intrinsic::getDeclaration(
498 SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
499
500 InnerFMF |= FMF;
501 II.setFastMathFlags(InnerFMF);
502
503 II.setCalledFunction(NewDecl);
504 return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
505 }
506
507 break;
508 }
509 case Intrinsic::amdgcn_sqrt:
510 case Intrinsic::amdgcn_rsq: {
511 Value *Src = II.getArgOperand(0);
512
513 // TODO: Move to ConstantFolding/InstSimplify?
514 if (isa<UndefValue>(Src)) {
515 Type *Ty = II.getType();
516 auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
517 return IC.replaceInstUsesWith(II, QNaN);
518 }
519
520 // f16 amdgcn.sqrt is identical to regular sqrt.
521 if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
522 Function *NewDecl = Intrinsic::getDeclaration(
523 II.getModule(), Intrinsic::sqrt, {II.getType()});
524 II.setCalledFunction(NewDecl);
525 return &II;
526 }
527
528 break;
529 }
530 case Intrinsic::amdgcn_log:
531 case Intrinsic::amdgcn_exp2: {
532 const bool IsLog = IID == Intrinsic::amdgcn_log;
533 const bool IsExp = IID == Intrinsic::amdgcn_exp2;
534 Value *Src = II.getArgOperand(0);
535 Type *Ty = II.getType();
536
537 if (isa<PoisonValue>(Src))
538 return IC.replaceInstUsesWith(II, Src);
539
540 if (IC.getSimplifyQuery().isUndefValue(Src))
541 return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
542
543 if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
544 if (C->isInfinity()) {
545 // exp2(+inf) -> +inf
546 // log2(+inf) -> +inf
547 if (!C->isNegative())
548 return IC.replaceInstUsesWith(II, C);
549
550 // exp2(-inf) -> 0
551 if (IsExp && C->isNegative())
552 return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
553 }
554
555 if (II.isStrictFP())
556 break;
557
558 if (C->isNaN()) {
559 Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
560 return IC.replaceInstUsesWith(II, Quieted);
561 }
562
563 // f32 instruction doesn't handle denormals, f16 does.
564 if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
565 Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
566 : ConstantFP::get(Ty, 1.0);
567 return IC.replaceInstUsesWith(II, FoldedValue);
568 }
569
570 if (IsLog && C->isNegative())
571 return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
572
573 // TODO: Full constant folding matching hardware behavior.
574 }
575
576 break;
577 }
578 case Intrinsic::amdgcn_frexp_mant:
579 case Intrinsic::amdgcn_frexp_exp: {
580 Value *Src = II.getArgOperand(0);
581 if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
582 int Exp;
583 APFloat Significand =
584 frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);
585
586 if (IID == Intrinsic::amdgcn_frexp_mant) {
587 return IC.replaceInstUsesWith(
588 II, ConstantFP::get(II.getContext(), Significand));
589 }
590
591 // Match instruction special case behavior.
592 if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
593 Exp = 0;
594
595 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Exp));
596 }
597
598 if (isa<UndefValue>(Src)) {
599 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
600 }
601
602 break;
603 }
604 case Intrinsic::amdgcn_class: {
605 Value *Src0 = II.getArgOperand(0);
606 Value *Src1 = II.getArgOperand(1);
607 const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
608 if (CMask) {
609 II.setCalledOperand(Intrinsic::getDeclaration(
610 II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
611
612 // Clamp any excess bits, as they're illegal for the generic intrinsic.
613 II.setArgOperand(1, ConstantInt::get(Src1->getType(),
614 CMask->getZExtValue() & fcAllFlags));
615 return &II;
616 }
617
618 // Propagate poison.
619 if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
620 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
621
622 // llvm.amdgcn.class(_, undef) -> false
623 if (IC.getSimplifyQuery().isUndefValue(Src1))
624 return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
625
626 // llvm.amdgcn.class(undef, mask) -> mask != 0
627 if (IC.getSimplifyQuery().isUndefValue(Src0)) {
628 Value *CmpMask = IC.Builder.CreateICmpNE(
629 Src1, ConstantInt::getNullValue(Src1->getType()));
630 return IC.replaceInstUsesWith(II, CmpMask);
631 }
632 break;
633 }
634 case Intrinsic::amdgcn_cvt_pkrtz: {
635 Value *Src0 = II.getArgOperand(0);
636 Value *Src1 = II.getArgOperand(1);
637 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
638 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
639 const fltSemantics &HalfSem =
640 II.getType()->getScalarType()->getFltSemantics();
641 bool LosesInfo;
642 APFloat Val0 = C0->getValueAPF();
643 APFloat Val1 = C1->getValueAPF();
644 Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
645 Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
646
647 Constant *Folded =
648 ConstantVector::get({ConstantFP::get(II.getContext(), Val0),
649 ConstantFP::get(II.getContext(), Val1)});
650 return IC.replaceInstUsesWith(II, Folded);
651 }
652 }
653
654 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
655 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
656 }
657
658 break;
659 }
660 case Intrinsic::amdgcn_cvt_pknorm_i16:
661 case Intrinsic::amdgcn_cvt_pknorm_u16:
662 case Intrinsic::amdgcn_cvt_pk_i16:
663 case Intrinsic::amdgcn_cvt_pk_u16: {
664 Value *Src0 = II.getArgOperand(0);
665 Value *Src1 = II.getArgOperand(1);
666
667 if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
668 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
669 }
670
671 break;
672 }
673 case Intrinsic::amdgcn_ubfe:
674 case Intrinsic::amdgcn_sbfe: {
675 // Decompose simple cases into standard shifts.
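// For example, ubfe(i32 %x, 8, 8) extracts bits [15:8] and becomes
// lshr (shl %x, 16), 24; sbfe uses ashr instead so the field is sign-extended.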
676 Value *Src = II.getArgOperand(0);
677 if (isa<UndefValue>(Src)) {
678 return IC.replaceInstUsesWith(II, Src);
679 }
680
681 unsigned Width;
682 Type *Ty = II.getType();
683 unsigned IntSize = Ty->getIntegerBitWidth();
684
685 ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
686 if (CWidth) {
687 Width = CWidth->getZExtValue();
688 if ((Width & (IntSize - 1)) == 0) {
689 return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
690 }
691
692 // Hardware ignores high bits, so remove those.
693 if (Width >= IntSize) {
694 return IC.replaceOperand(
695 II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
696 }
697 }
698
699 unsigned Offset;
700 ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
701 if (COffset) {
702 Offset = COffset->getZExtValue();
703 if (Offset >= IntSize) {
704 return IC.replaceOperand(
705 II, 1,
706 ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
707 }
708 }
709
710 bool Signed = IID == Intrinsic::amdgcn_sbfe;
711
712 if (!CWidth || !COffset)
713 break;
714
715 // The case of Width == 0 is handled above, which makes this transformation
716 // safe. If Width == 0, the ashr and lshr instructions would produce poison,
717 // since the shift amount would be equal to the bit size.
718 assert(Width != 0);
719
720 // TODO: This allows folding to undef when the hardware has specific
721 // behavior?
722 if (Offset + Width < IntSize) {
723 Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
724 Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
725 : IC.Builder.CreateLShr(Shl, IntSize - Width);
726 RightShift->takeName(&II);
727 return IC.replaceInstUsesWith(II, RightShift);
728 }
729
730 Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
731 : IC.Builder.CreateLShr(Src, Offset);
732
733 RightShift->takeName(&II);
734 return IC.replaceInstUsesWith(II, RightShift);
735 }
736 case Intrinsic::amdgcn_exp:
737 case Intrinsic::amdgcn_exp_row:
738 case Intrinsic::amdgcn_exp_compr: {
739 ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
740 unsigned EnBits = En->getZExtValue();
741 if (EnBits == 0xf)
742 break; // All inputs enabled.
743
744 bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
745 bool Changed = false;
746 for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
747 if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
748 (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
749 Value *Src = II.getArgOperand(I + 2);
750 if (!isa<UndefValue>(Src)) {
751 IC.replaceOperand(II, I + 2, UndefValue::get(Src->getType()));
752 Changed = true;
753 }
754 }
755 }
756
757 if (Changed) {
758 return &II;
759 }
760
761 break;
762 }
763 case Intrinsic::amdgcn_fmed3: {
764 // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
765 // for the shader.
766
767 Value *Src0 = II.getArgOperand(0);
768 Value *Src1 = II.getArgOperand(1);
769 Value *Src2 = II.getArgOperand(2);
770
771 // Checking for NaN before canonicalization provides better fidelity when
772 // mapping other operations onto fmed3 since the order of operands is
773 // unchanged.
774 CallInst *NewCall = nullptr;
775 if (match(Src0, PatternMatch::m_NaN()) || isa<UndefValue>(Src0)) {
776 NewCall = IC.Builder.CreateMinNum(Src1, Src2);
777 } else if (match(Src1, PatternMatch::m_NaN()) || isa<UndefValue>(Src1)) {
778 NewCall = IC.Builder.CreateMinNum(Src0, Src2);
779 } else if (match(Src2, PatternMatch::m_NaN()) || isa<UndefValue>(Src2)) {
780 NewCall = IC.Builder.CreateMaxNum(Src0, Src1);
781 }
782
783 if (NewCall) {
784 NewCall->copyFastMathFlags(&II);
785 NewCall->takeName(&II);
786 return IC.replaceInstUsesWith(II, NewCall);
787 }
788
789 bool Swap = false;
790 // Canonicalize constants to RHS operands.
791 //
792 // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
793 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
794 std::swap(Src0, Src1);
795 Swap = true;
796 }
797
798 if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
799 std::swap(Src1, Src2);
800 Swap = true;
801 }
802
803 if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
804 std::swap(Src0, Src1);
805 Swap = true;
806 }
807
808 if (Swap) {
809 II.setArgOperand(0, Src0);
810 II.setArgOperand(1, Src1);
811 II.setArgOperand(2, Src2);
812 return &II;
813 }
814
815 if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
816 if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
817 if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
818 APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
819 C2->getValueAPF());
820 return IC.replaceInstUsesWith(
821 II, ConstantFP::get(IC.Builder.getContext(), Result));
822 }
823 }
824 }
825
826 if (!ST->hasMed3_16())
827 break;
828
829 Value *X, *Y, *Z;
830
831 // Repeat floating-point width reduction done for minnum/maxnum.
832 // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
833 if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
834 matchFPExtFromF16(Src2, Z)) {
835 Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
836 {X, Y, Z}, &II, II.getName());
837 return new FPExtInst(NewCall, II.getType());
838 }
839
840 break;
841 }
842 case Intrinsic::amdgcn_icmp:
843 case Intrinsic::amdgcn_fcmp: {
844 const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
845 // Guard against invalid arguments.
846 int64_t CCVal = CC->getZExtValue();
847 bool IsInteger = IID == Intrinsic::amdgcn_icmp;
848 if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
849 CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
850 (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
851 CCVal > CmpInst::LAST_FCMP_PREDICATE)))
852 break;
853
854 Value *Src0 = II.getArgOperand(0);
855 Value *Src1 = II.getArgOperand(1);
856
857 if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
858 if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
859 Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
860 if (CCmp->isNullValue()) {
861 return IC.replaceInstUsesWith(
862 II, IC.Builder.CreateSExt(CCmp, II.getType()));
863 }
864
865 // The result of V_ICMP/V_FCMP assembly instructions (which this
866 // intrinsic exposes) is one bit per thread, masked with the EXEC
867 // register (which contains the bitmask of live threads). So a
868 // comparison that always returns true is the same as a read of the
869 // EXEC register.
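// For example, llvm.amdgcn.icmp(i32 0, i32 0, eq) is true in every active
// lane, so it folds to a read of "exec".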
870 Function *NewF = Intrinsic::getDeclaration(
871 II.getModule(), Intrinsic::read_register, II.getType());
872 Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
873 MDNode *MD = MDNode::get(II.getContext(), MDArgs);
874 Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
875 CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
876 NewCall->addFnAttr(Attribute::Convergent);
877 NewCall->takeName(&II);
878 return IC.replaceInstUsesWith(II, NewCall);
879 }
880
881 // Canonicalize constants to RHS.
882 CmpInst::Predicate SwapPred =
883 CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
884 II.setArgOperand(0, Src1);
885 II.setArgOperand(1, Src0);
886 II.setArgOperand(
887 2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
888 return &II;
889 }
890
891 if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
892 break;
893
894 // Canonicalize compare eq with true value to compare != 0
895 // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
896 // -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
897 // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
898 // -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
899 Value *ExtSrc;
900 if (CCVal == CmpInst::ICMP_EQ &&
901 ((match(Src1, PatternMatch::m_One()) &&
902 match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
903 (match(Src1, PatternMatch::m_AllOnes()) &&
904 match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
905 ExtSrc->getType()->isIntegerTy(1)) {
906 IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
907 IC.replaceOperand(II, 2,
908 ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
909 return &II;
910 }
911
912 CmpInst::Predicate SrcPred;
913 Value *SrcLHS;
914 Value *SrcRHS;
915
916 // Fold compare eq/ne with 0 from a compare result as the predicate to the
917 // intrinsic. The typical use is a wave vote function in the library, which
918 // will be fed from a user code condition compared with 0. Fold in the
919 // redundant compare.
920
921 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
922 // -> llvm.amdgcn.[if]cmp(a, b, pred)
923 //
924 // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
925 // -> llvm.amdgcn.[if]cmp(a, b, inv pred)
926 if (match(Src1, PatternMatch::m_Zero()) &&
927 match(Src0, PatternMatch::m_ZExtOrSExt(
928 m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
929 PatternMatch::m_Value(SrcRHS))))) {
930 if (CCVal == CmpInst::ICMP_EQ)
931 SrcPred = CmpInst::getInversePredicate(SrcPred);
932
933 Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
934 ? Intrinsic::amdgcn_fcmp
935 : Intrinsic::amdgcn_icmp;
936
937 Type *Ty = SrcLHS->getType();
938 if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
939 // Promote to next legal integer type.
940 unsigned Width = CmpType->getBitWidth();
941 unsigned NewWidth = Width;
942
943 // Don't do anything for i1 comparisons.
944 if (Width == 1)
945 break;
946
947 if (Width <= 16)
948 NewWidth = 16;
949 else if (Width <= 32)
950 NewWidth = 32;
951 else if (Width <= 64)
952 NewWidth = 64;
953 else if (Width > 64)
954 break; // Can't handle this.
955
956 if (Width != NewWidth) {
957 IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
958 if (CmpInst::isSigned(SrcPred)) {
959 SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
960 SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
961 } else {
962 SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
963 SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
964 }
965 }
966 } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
967 break;
968
969 Function *NewF = Intrinsic::getDeclaration(
970 II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
971 Value *Args[] = {SrcLHS, SrcRHS,
972 ConstantInt::get(CC->getType(), SrcPred)};
973 CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
974 NewCall->takeName(&II);
975 return IC.replaceInstUsesWith(II, NewCall);
976 }
977
978 break;
979 }
980 case Intrinsic::amdgcn_mbcnt_hi: {
981 // exec_hi is all 0, so this is just a copy.
982 if (ST->isWave32())
983 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
984 break;
985 }
986 case Intrinsic::amdgcn_ballot: {
987 if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
988 if (Src->isZero()) {
989 // amdgcn.ballot(i1 0) is zero.
990 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
991 }
992 }
993 break;
994 }
995 case Intrinsic::amdgcn_wqm_vote: {
996 // wqm_vote is identity when the argument is constant.
997 if (!isa<Constant>(II.getArgOperand(0)))
998 break;
999
1000 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1001 }
1002 case Intrinsic::amdgcn_kill: {
1003 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1004 if (!C || !C->getZExtValue())
1005 break;
1006
1007 // amdgcn.kill(i1 1) is a no-op
1008 return IC.eraseInstFromFunction(II);
1009 }
1010 case Intrinsic::amdgcn_update_dpp: {
1011 Value *Old = II.getArgOperand(0);
1012
1013 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1014 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1015 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1016 if (BC->isZeroValue() || RM->getZExtValue() != 0xF ||
1017 BM->getZExtValue() != 0xF || isa<UndefValue>(Old))
1018 break;
1019
1020 // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
1021 return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
1022 }
1023 case Intrinsic::amdgcn_permlane16:
1024 case Intrinsic::amdgcn_permlane16_var:
1025 case Intrinsic::amdgcn_permlanex16:
1026 case Intrinsic::amdgcn_permlanex16_var: {
1027 // Discard vdst_in if it's not going to be read.
1028 Value *VDstIn = II.getArgOperand(0);
1029 if (isa<UndefValue>(VDstIn))
1030 break;
1031
1032 // FetchInvalid operand idx.
1033 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1034 IID == Intrinsic::amdgcn_permlanex16)
1035 ? 4 /* for permlane16 and permlanex16 */
1036 : 3; /* for permlane16_var and permlanex16_var */
1037
1038 // BoundCtrl operand idx.
1039 // For permlane16 and permlanex16 it should be 5
1040 // For Permlane16_var and permlanex16_var it should be 4
1041 unsigned int BcIdx = FiIdx + 1;
1042
1043 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1044 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1045 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1046 break;
1047
1048 return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
1049 }
1050 case Intrinsic::amdgcn_permlane64:
1051 // A constant value is trivially uniform.
1052 if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1053 return IC.replaceInstUsesWith(II, C);
1054 }
1055 break;
1056 case Intrinsic::amdgcn_readfirstlane:
1057 case Intrinsic::amdgcn_readlane: {
1058 // A constant value is trivially uniform.
1059 if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
1060 return IC.replaceInstUsesWith(II, C);
1061 }
1062
1063 // The rest of these may not be safe if the exec may not be the same between
1064 // the def and use.
1065 Value *Src = II.getArgOperand(0);
1066 Instruction *SrcInst = dyn_cast<Instruction>(Src);
1067 if (SrcInst && SrcInst->getParent() != II.getParent())
1068 break;
1069
1070 // readfirstlane (readfirstlane x) -> readfirstlane x
1071 // readlane (readfirstlane x), y -> readfirstlane x
1072 if (match(Src,
1073 PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readfirstlane>())) {
1074 return IC.replaceInstUsesWith(II, Src);
1075 }
1076
1077 if (IID == Intrinsic::amdgcn_readfirstlane) {
1078 // readfirstlane (readlane x, y) -> readlane x, y
1079 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>())) {
1080 return IC.replaceInstUsesWith(II, Src);
1081 }
1082 } else {
1083 // readlane (readlane x, y), y -> readlane x, y
1084 if (match(Src, PatternMatch::m_Intrinsic<Intrinsic::amdgcn_readlane>(
1085 PatternMatch::m_Value(),
1086 PatternMatch::m_Specific(II.getArgOperand(1))))) {
1087 return IC.replaceInstUsesWith(II, Src);
1088 }
1089 }
1090
1091 break;
1092 }
1093 case Intrinsic::amdgcn_fmul_legacy: {
1094 Value *Op0 = II.getArgOperand(0);
1095 Value *Op1 = II.getArgOperand(1);
1096
1097 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1098 // infinity, gives +0.0.
1099 // TODO: Move to InstSimplify?
1100 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1101 match(Op1, PatternMatch::m_AnyZeroFP()))
1102 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1103
1104 // If we can prove we don't have one of the special cases then we can use a
1105 // normal fmul instruction instead.
1106 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1107 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
1108 FMul->takeName(&II);
1109 return IC.replaceInstUsesWith(II, FMul);
1110 }
1111 break;
1112 }
1113 case Intrinsic::amdgcn_fma_legacy: {
1114 Value *Op0 = II.getArgOperand(0);
1115 Value *Op1 = II.getArgOperand(1);
1116 Value *Op2 = II.getArgOperand(2);
1117
1118 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
1119 // infinity, gives +0.0.
1120 // TODO: Move to InstSimplify?
1121 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
1122 match(Op1, PatternMatch::m_AnyZeroFP())) {
1123 // It's tempting to just return Op2 here, but that would give the wrong
1124 // result if Op2 was -0.0.
1125 auto *Zero = ConstantFP::getZero(II.getType());
1126 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
1127 FAdd->takeName(&II);
1128 return IC.replaceInstUsesWith(II, FAdd);
1129 }
1130
1131 // If we can prove we don't have one of the special cases then we can use a
1132 // normal fma instead.
1133 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
1134 II.setCalledOperand(Intrinsic::getDeclaration(
1135 II.getModule(), Intrinsic::fma, II.getType()));
1136 return &II;
1137 }
1138 break;
1139 }
1140 case Intrinsic::amdgcn_is_shared:
1141 case Intrinsic::amdgcn_is_private: {
1142 if (isa<UndefValue>(II.getArgOperand(0)))
1143 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
1144
1145 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
1146 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
1147 break;
1148 }
1149 case Intrinsic::amdgcn_buffer_store_format:
1150 case Intrinsic::amdgcn_raw_buffer_store_format:
1151 case Intrinsic::amdgcn_struct_buffer_store_format:
1152 case Intrinsic::amdgcn_raw_tbuffer_store:
1153 case Intrinsic::amdgcn_struct_tbuffer_store:
1154 case Intrinsic::amdgcn_tbuffer_store:
1155 case Intrinsic::amdgcn_image_store_1d:
1156 case Intrinsic::amdgcn_image_store_1darray:
1157 case Intrinsic::amdgcn_image_store_2d:
1158 case Intrinsic::amdgcn_image_store_2darray:
1159 case Intrinsic::amdgcn_image_store_2darraymsaa:
1160 case Intrinsic::amdgcn_image_store_2dmsaa:
1161 case Intrinsic::amdgcn_image_store_3d:
1162 case Intrinsic::amdgcn_image_store_cube:
1163 case Intrinsic::amdgcn_image_store_mip_1d:
1164 case Intrinsic::amdgcn_image_store_mip_1darray:
1165 case Intrinsic::amdgcn_image_store_mip_2d:
1166 case Intrinsic::amdgcn_image_store_mip_2darray:
1167 case Intrinsic::amdgcn_image_store_mip_3d:
1168 case Intrinsic::amdgcn_image_store_mip_cube: {
1169 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
1170 break;
1171
1172 APInt DemandedElts;
1173 if (ST->hasDefaultComponentBroadcast())
1174 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
1175 else if (ST->hasDefaultComponentZero())
1176 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
1177 else
1178 break;
1179
1180 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
1181 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
1182 false)) {
1183 return IC.eraseInstFromFunction(II);
1184 }
1185
1186 break;
1187 }
1188 }
1189 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
1190 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
1191 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
1192 }
1193 return std::nullopt;
1194 }
1195
1196 /// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
1197 ///
1198 /// For image and buffer store intrinsics, simplification updates the definition
1199 /// of the intrinsic's vector data argument rather than uses of the result, as is
1200 /// done for image and buffer loads.
1201 /// Note: This only supports non-TFE/LWE image intrinsic calls; those have
1202 /// struct returns.
1203 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
1204 IntrinsicInst &II,
1205 APInt DemandedElts,
1206 int DMaskIdx, bool IsLoad) {
1207
1208 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
1209 : II.getOperand(0)->getType());
1210 unsigned VWidth = IIVTy->getNumElements();
1211 if (VWidth == 1)
1212 return nullptr;
1213 Type *EltTy = IIVTy->getElementType();
1214
1215 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
1216 IC.Builder.SetInsertPoint(&II);
1217
1218 // Assume the arguments are unchanged and later override them, if needed.
1219 SmallVector<Value *, 16> Args(II.args());
1220
1221 if (DMaskIdx < 0) {
1222 // Buffer case.
1223
1224 const unsigned ActiveBits = DemandedElts.getActiveBits();
1225 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
1226
1227 // Start assuming the prefix of elements is demanded, but possibly clear
1228 // some other bits if there are trailing zeros (unused components at front)
1229 // and update offset.
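// For example, with DemandedElts = 0b0110 for a <4 x i32> raw.buffer.load:
// ActiveBits is 3 and there is one unused component at the front, so the
// demanded mask becomes 0b0110, the offset argument is bumped by 4 bytes, and
// the load is shrunk to a <2 x i32> covering elements 1 and 2.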
1230 DemandedElts = (1 << ActiveBits) - 1;
1231
1232 if (UnusedComponentsAtFront > 0) {
1233 static const unsigned InvalidOffsetIdx = 0xf;
1234
1235 unsigned OffsetIdx;
1236 switch (II.getIntrinsicID()) {
1237 case Intrinsic::amdgcn_raw_buffer_load:
1238 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1239 OffsetIdx = 1;
1240 break;
1241 case Intrinsic::amdgcn_s_buffer_load:
1242 // If the resulting type is vec3, there is no point in trimming the
1243 // load with an updated offset, as the vec3 would most likely be widened to
1244 // vec4 anyway during lowering.
1245 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
1246 OffsetIdx = InvalidOffsetIdx;
1247 else
1248 OffsetIdx = 1;
1249 break;
1250 case Intrinsic::amdgcn_struct_buffer_load:
1251 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1252 OffsetIdx = 2;
1253 break;
1254 default:
1255 // TODO: handle tbuffer* intrinsics.
1256 OffsetIdx = InvalidOffsetIdx;
1257 break;
1258 }
1259
1260 if (OffsetIdx != InvalidOffsetIdx) {
1261 // Clear demanded bits and update the offset.
1262 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
1263 auto *Offset = Args[OffsetIdx];
1264 unsigned SingleComponentSizeInBits =
1265 IC.getDataLayout().getTypeSizeInBits(EltTy);
1266 unsigned OffsetAdd =
1267 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
1268 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
1269 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
1270 }
1271 }
1272 } else {
1273 // Image case.
1274
1275 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
1276 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
1277
1278 // dmask 0 has special semantics, do not simplify.
1279 if (DMaskVal == 0)
1280 return nullptr;
1281
1282 // Mask off values that are undefined because the dmask doesn't cover them
1283 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
1284
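// For example, with dmask = 0b1011 (loaded/stored components for channels 0,
// 1 and 3) and only vector elements 0 and 2 demanded, the loop below rebuilds
// the dmask as 0b1001: channel 0 for element 0 and channel 3 for element 2.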
1285 unsigned NewDMaskVal = 0;
1286 unsigned OrigLdStIdx = 0;
1287 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
1288 const unsigned Bit = 1 << SrcIdx;
1289 if (!!(DMaskVal & Bit)) {
1290 if (!!DemandedElts[OrigLdStIdx])
1291 NewDMaskVal |= Bit;
1292 OrigLdStIdx++;
1293 }
1294 }
1295
1296 if (DMaskVal != NewDMaskVal)
1297 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
1298 }
1299
1300 unsigned NewNumElts = DemandedElts.popcount();
1301 if (!NewNumElts)
1302 return PoisonValue::get(IIVTy);
1303
1304 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
1305 if (DMaskIdx >= 0)
1306 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
1307 return nullptr;
1308 }
1309
1310 // Validate function argument and return types, extracting overloaded types
1311 // along the way.
1312 SmallVector<Type *, 6> OverloadTys;
1313 if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), OverloadTys))
1314 return nullptr;
1315
1316 Type *NewTy =
1317 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
1318 OverloadTys[0] = NewTy;
1319
1320 if (!IsLoad) {
1321 SmallVector<int, 8> EltMask;
1322 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
1323 if (DemandedElts[OrigStoreIdx])
1324 EltMask.push_back(OrigStoreIdx);
1325
1326 if (NewNumElts == 1)
1327 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
1328 else
1329 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
1330 }
1331
1332 Function *NewIntrin = Intrinsic::getDeclaration(
1333 II.getModule(), II.getIntrinsicID(), OverloadTys);
1334 CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
1335 NewCall->takeName(&II);
1336 NewCall->copyMetadata(II);
1337
1338 if (IsLoad) {
1339 if (NewNumElts == 1) {
1340 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
1341 DemandedElts.countr_zero());
1342 }
1343
1344 SmallVector<int, 8> EltMask;
1345 unsigned NewLoadIdx = 0;
1346 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
1347 if (!!DemandedElts[OrigLoadIdx])
1348 EltMask.push_back(NewLoadIdx++);
1349 else
1350 EltMask.push_back(NewNumElts);
1351 }
1352
1353 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
1354
1355 return Shuffle;
1356 }
1357
1358 return NewCall;
1359 }
1360
1361 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
1362 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
1363 APInt &UndefElts2, APInt &UndefElts3,
1364 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1365 SimplifyAndSetOp) const {
1366 switch (II.getIntrinsicID()) {
1367 case Intrinsic::amdgcn_buffer_load:
1368 case Intrinsic::amdgcn_buffer_load_format:
1369 case Intrinsic::amdgcn_raw_buffer_load:
1370 case Intrinsic::amdgcn_raw_ptr_buffer_load:
1371 case Intrinsic::amdgcn_raw_buffer_load_format:
1372 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
1373 case Intrinsic::amdgcn_raw_tbuffer_load:
1374 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
1375 case Intrinsic::amdgcn_s_buffer_load:
1376 case Intrinsic::amdgcn_struct_buffer_load:
1377 case Intrinsic::amdgcn_struct_ptr_buffer_load:
1378 case Intrinsic::amdgcn_struct_buffer_load_format:
1379 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
1380 case Intrinsic::amdgcn_struct_tbuffer_load:
1381 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
1382 case Intrinsic::amdgcn_tbuffer_load:
1383 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
1384 default: {
1385 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
1386 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
1387 }
1388 break;
1389 }
1390 }
1391 return std::nullopt;
1392 }
1393