1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPULegalizerInfo.h"
15
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41 "amdgpu-global-isel-new-legality",
42 cl::desc("Use GlobalISel desired legality, rather than try to use "
43 "rules compatible with selection patterns"),
44 cl::init(false),
45 cl::ReallyHidden);
46
47 static constexpr unsigned MaxRegisterSize = 1024;
48
49 // Round the number of elements up to the next power of two.
50 static LLT getPow2VectorType(LLT Ty) {
51 unsigned NElts = Ty.getNumElements();
52 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
53 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55
56 // Round the number of bits up to the next power of two.
57 static LLT getPow2ScalarType(LLT Ty) {
58 unsigned Bits = Ty.getSizeInBits();
59 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
60 return LLT::scalar(Pow2Bits);
61 }
62
63 /// \returns true if this is an odd-sized vector which should be widened by adding an
64 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65 /// excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67 return [=](const LegalityQuery &Query) {
68 const LLT Ty = Query.Types[TypeIdx];
69 if (!Ty.isVector())
70 return false;
71
72 const LLT EltTy = Ty.getElementType();
73 const unsigned EltSize = EltTy.getSizeInBits();
74 return Ty.getNumElements() % 2 != 0 &&
75 EltSize > 1 && EltSize < 32 &&
76 Ty.getSizeInBits() % 32 != 0;
77 };
78 }
79
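// Predicate matching types whose total size in bits is a multiple of 32.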
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81 return [=](const LegalityQuery &Query) {
82 const LLT Ty = Query.Types[TypeIdx];
83 return Ty.getSizeInBits() % 32 == 0;
84 };
85 }
86
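// Predicate matching vectors of 16-bit elements with more than two elements.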
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88 return [=](const LegalityQuery &Query) {
89 const LLT Ty = Query.Types[TypeIdx];
90 const LLT EltTy = Ty.getScalarType();
91 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92 };
93 }
94
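// Mutation that adds one element to the vector type at TypeIdx, e.g. turning
// <3 x s16> into <4 x s16>.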
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96 return [=](const LegalityQuery &Query) {
97 const LLT Ty = Query.Types[TypeIdx];
98 const LLT EltTy = Ty.getElementType();
99 return std::make_pair(TypeIdx,
100 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101 };
102 }
103
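// Mutation that shrinks the element count of the vector at TypeIdx so the
// original value splits into roughly 64-bit pieces.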
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 unsigned Size = Ty.getSizeInBits();
109 unsigned Pieces = (Size + 63) / 64;
110 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111 return std::make_pair(
112 TypeIdx,
113 LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114 };
115 }
116
117 // Increase the number of vector elements so the total size reaches the next
118 // multiple of 32 bits.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120 return [=](const LegalityQuery &Query) {
121 const LLT Ty = Query.Types[TypeIdx];
122
123 const LLT EltTy = Ty.getElementType();
124 const int Size = Ty.getSizeInBits();
125 const int EltSize = EltTy.getSizeInBits();
126 const int NextMul32 = (Size + 31) / 32;
127
128 assert(EltSize < 32);
129
130 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131 return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132 };
133 }
134
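// Compute the register-like type used when bitcasting for legalization: a
// single scalar for sizes of 32 bits or less, otherwise a scalar or vector of
// 32-bit pieces.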
135 static LLT getBitcastRegisterType(const LLT Ty) {
136 const unsigned Size = Ty.getSizeInBits();
137
138 if (Size <= 32) {
139 // <2 x s8> -> s16
140 // <4 x s8> -> s32
141 return LLT::scalar(Size);
142 }
143
144 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146
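// Mutation wrapping getBitcastRegisterType for the type at TypeIdx.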
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148 return [=](const LegalityQuery &Query) {
149 const LLT Ty = Query.Types[TypeIdx];
150 return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151 };
152 }
153
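// Mutation that bitcasts the type at TypeIdx (whose size must be a multiple of
// 32) to a scalar or vector with 32-bit elements.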
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155 return [=](const LegalityQuery &Query) {
156 const LLT Ty = Query.Types[TypeIdx];
157 unsigned Size = Ty.getSizeInBits();
158 assert(Size % 32 == 0);
159 return std::make_pair(
160 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161 };
162 }
163
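// Predicate matching vector types narrower than Size bits.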
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165 return [=](const LegalityQuery &Query) {
166 const LLT QueryTy = Query.Types[TypeIdx];
167 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168 };
169 }
170
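// Predicate matching vector types wider than Size bits.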
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172 return [=](const LegalityQuery &Query) {
173 const LLT QueryTy = Query.Types[TypeIdx];
174 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175 };
176 }
177
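// Predicate matching vector types with an odd number of elements.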
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179 return [=](const LegalityQuery &Query) {
180 const LLT QueryTy = Query.Types[TypeIdx];
181 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182 };
183 }
184
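// True if Size is a multiple of 32 bits no larger than the maximum register size.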
185 static bool isRegisterSize(unsigned Size) {
186 return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188
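// True for element types that pack cleanly into registers: 16-bit elements or
// any multiple of 32 bits.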
189 static bool isRegisterVectorElementType(LLT EltTy) {
190 const int EltSize = EltTy.getSizeInBits();
191 return EltSize == 16 || EltSize % 32 == 0;
192 }
193
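// True for vector types whose element size maps onto registers: 32-, 64-, 128-
// or 256-bit elements, or an even number of 16-bit elements.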
194 static bool isRegisterVectorType(LLT Ty) {
195 const int EltSize = Ty.getElementType().getSizeInBits();
196 return EltSize == 32 || EltSize == 64 ||
197 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198 EltSize == 128 || EltSize == 256;
199 }
200
201 static bool isRegisterType(LLT Ty) {
202 if (!isRegisterSize(Ty.getSizeInBits()))
203 return false;
204
205 if (Ty.isVector())
206 return isRegisterVectorType(Ty);
207
208 return true;
209 }
210
211 // Any combination of 32- or 64-bit elements up to the maximum register size, and
212 // multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214 return [=](const LegalityQuery &Query) {
215 return isRegisterType(Query.Types[TypeIdx]);
216 };
217 }
218
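// Predicate matching vectors whose element type is s16 or at least 32 bits wide.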
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220 return [=](const LegalityQuery &Query) {
221 const LLT QueryTy = Query.Types[TypeIdx];
222 if (!QueryTy.isVector())
223 return false;
224 const LLT EltTy = QueryTy.getElementType();
225 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226 };
227 }
228
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232 return [=](const LegalityQuery &Query) {
233 const LLT Ty = Query.Types[TypeIdx];
234 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236 };
237 }
238
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243 bool IsLoad) {
244 switch (AS) {
245 case AMDGPUAS::PRIVATE_ADDRESS:
246 // FIXME: Private element size.
247 return ST.enableFlatScratch() ? 128 : 32;
248 case AMDGPUAS::LOCAL_ADDRESS:
249 return ST.useDS128() ? 128 : 64;
250 case AMDGPUAS::GLOBAL_ADDRESS:
251 case AMDGPUAS::CONSTANT_ADDRESS:
252 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253 // Treat constant and global as identical. SMRD loads are sometimes usable for
254 // global loads (ideally constant address space should be eliminated)
255 // depending on the context. Legality cannot be context dependent, but
256 // RegBankSelect can split the load as necessary depending on the pointer
257 // register bank/uniformity and if the memory is invariant or not written in a
258 // kernel.
259 return IsLoad ? 512 : 128;
260 default:
261 // Flat addresses may contextually need to be split to 32-bit parts if they
262 // may alias scratch depending on the subtarget.
263 return 128;
264 }
265 }
266
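// Return true if a load/store with this register type, memory size, alignment
// and address space can be handled without further splitting.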
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268 const LegalityQuery &Query) {
269 const LLT Ty = Query.Types[0];
270
271 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273
274 unsigned RegSize = Ty.getSizeInBits();
275 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277 unsigned AS = Query.Types[1].getAddressSpace();
278
279 // All of these need to be custom lowered to cast the pointer operand.
280 if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281 return false;
282
283 // Do not handle extending vector loads.
284 if (Ty.isVector() && MemSize != RegSize)
285 return false;
286
287 // TODO: We should be able to widen loads if the alignment is high enough, but
288 // we also need to modify the memory access size.
289 #if 0
290 // Accept widening loads based on alignment.
291 if (IsLoad && MemSize < RegSize)
292 MemSize = std::max(MemSize, AlignBits);
293 #endif
294
295 // Only 1-byte and 2-byte to 32-bit extloads are valid.
296 if (MemSize != RegSize && RegSize != 32)
297 return false;
298
299 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300 return false;
301
302 switch (MemSize) {
303 case 8:
304 case 16:
305 case 32:
306 case 64:
307 case 128:
308 break;
309 case 96:
310 if (!ST.hasDwordx3LoadStores())
311 return false;
312 break;
313 case 256:
314 case 512:
315 // These may contextually need to be broken down.
316 break;
317 default:
318 return false;
319 }
320
321 assert(RegSize >= MemSize);
322
323 if (AlignBits < MemSize) {
324 const SITargetLowering *TLI = ST.getTargetLowering();
325 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326 Align(AlignBits / 8)))
327 return false;
328 }
329
330 return true;
331 }
332
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
334 // work around this. Eventually it should ignore the type for loads and only care
335 // about the size. Return true in cases where we will work around this for now by
336 // bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338 if (EnableNewLegality)
339 return false;
340
341 const unsigned Size = Ty.getSizeInBits();
342 if (Size <= 64)
343 return false;
344 if (!Ty.isVector())
345 return true;
346
347 LLT EltTy = Ty.getElementType();
348 if (EltTy.isPointer())
349 return true;
350
351 unsigned EltSize = EltTy.getSizeInBits();
352 return EltSize != 32 && EltSize != 64;
353 }
354
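// Combined load/store legality check: the value must be a register type, the
// size and alignment must be handleable, and the bitcast workaround must not apply.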
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356 const LLT Ty = Query.Types[0];
357 return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358 !loadStoreBitcastWorkaround(Ty);
359 }
360
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364 const LLT MemTy) {
365 const unsigned MemSizeInBits = MemTy.getSizeInBits();
366 const unsigned Size = Ty.getSizeInBits();
367 if (Size != MemSizeInBits)
368 return Size <= 32 && Ty.isVector();
369
370 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371 return true;
372
373 // Don't try to handle bitcasting vector ext loads for now.
374 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375 (Size <= 32 || isRegisterSize(Size)) &&
376 !isRegisterVectorElementType(Ty.getElementType());
377 }
378
379 /// Return true if we should legalize a load by widening an odd-sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383 uint64_t AlignInBits, unsigned AddrSpace,
384 unsigned Opcode) {
385 unsigned SizeInBits = MemoryTy.getSizeInBits();
386 // We don't want to widen cases that are naturally legal.
387 if (isPowerOf2_32(SizeInBits))
388 return false;
389
390 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391 // end up widening these for a scalar load during RegBankSelect, since there
392 // aren't 96-bit scalar loads.
393 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394 return false;
395
396 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397 return false;
398
399 // A load is known dereferenceable up to the alignment, so it's legal to widen
400 // to it.
401 //
402 // TODO: Could check dereferenceable for less aligned cases.
403 unsigned RoundedSize = NextPowerOf2(SizeInBits);
404 if (AlignInBits < RoundedSize)
405 return false;
406
407 // Do not widen if it would introduce a slow unaligned load.
408 const SITargetLowering *TLI = ST.getTargetLowering();
409 bool Fast = false;
410 return TLI->allowsMisalignedMemoryAccessesImpl(
411 RoundedSize, AddrSpace, Align(AlignInBits / 8),
412 MachineMemOperand::MOLoad, &Fast) &&
413 Fast;
414 }
415
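// LegalityQuery wrapper for shouldWidenLoad; atomic accesses are never widened.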
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417 unsigned Opcode) {
418 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419 return false;
420
421 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422 Query.MMODescrs[0].AlignInBits,
423 Query.Types[1].getAddressSpace(), Opcode);
424 }
425
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427 const GCNTargetMachine &TM)
428 : ST(ST_) {
429 using namespace TargetOpcode;
430
431 auto GetAddrSpacePtr = [&TM](unsigned AS) {
432 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433 };
434
435 const LLT S1 = LLT::scalar(1);
436 const LLT S8 = LLT::scalar(8);
437 const LLT S16 = LLT::scalar(16);
438 const LLT S32 = LLT::scalar(32);
439 const LLT S64 = LLT::scalar(64);
440 const LLT S128 = LLT::scalar(128);
441 const LLT S256 = LLT::scalar(256);
442 const LLT S512 = LLT::scalar(512);
443 const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444
445 const LLT V2S8 = LLT::fixed_vector(2, 8);
446 const LLT V2S16 = LLT::fixed_vector(2, 16);
447 const LLT V4S16 = LLT::fixed_vector(4, 16);
448
449 const LLT V2S32 = LLT::fixed_vector(2, 32);
450 const LLT V3S32 = LLT::fixed_vector(3, 32);
451 const LLT V4S32 = LLT::fixed_vector(4, 32);
452 const LLT V5S32 = LLT::fixed_vector(5, 32);
453 const LLT V6S32 = LLT::fixed_vector(6, 32);
454 const LLT V7S32 = LLT::fixed_vector(7, 32);
455 const LLT V8S32 = LLT::fixed_vector(8, 32);
456 const LLT V9S32 = LLT::fixed_vector(9, 32);
457 const LLT V10S32 = LLT::fixed_vector(10, 32);
458 const LLT V11S32 = LLT::fixed_vector(11, 32);
459 const LLT V12S32 = LLT::fixed_vector(12, 32);
460 const LLT V13S32 = LLT::fixed_vector(13, 32);
461 const LLT V14S32 = LLT::fixed_vector(14, 32);
462 const LLT V15S32 = LLT::fixed_vector(15, 32);
463 const LLT V16S32 = LLT::fixed_vector(16, 32);
464 const LLT V32S32 = LLT::fixed_vector(32, 32);
465
466 const LLT V2S64 = LLT::fixed_vector(2, 64);
467 const LLT V3S64 = LLT::fixed_vector(3, 64);
468 const LLT V4S64 = LLT::fixed_vector(4, 64);
469 const LLT V5S64 = LLT::fixed_vector(5, 64);
470 const LLT V6S64 = LLT::fixed_vector(6, 64);
471 const LLT V7S64 = LLT::fixed_vector(7, 64);
472 const LLT V8S64 = LLT::fixed_vector(8, 64);
473 const LLT V16S64 = LLT::fixed_vector(16, 64);
474
475 std::initializer_list<LLT> AllS32Vectors =
476 {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477 V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478 std::initializer_list<LLT> AllS64Vectors =
479 {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480
481 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488
489 const LLT CodePtr = FlatPtr;
490
491 const std::initializer_list<LLT> AddrSpaces64 = {
492 GlobalPtr, ConstantPtr, FlatPtr
493 };
494
495 const std::initializer_list<LLT> AddrSpaces32 = {
496 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497 };
498
499 const std::initializer_list<LLT> FPTypesBase = {
500 S32, S64
501 };
502
503 const std::initializer_list<LLT> FPTypes16 = {
504 S32, S64, S16
505 };
506
507 const std::initializer_list<LLT> FPTypesPK16 = {
508 S32, S64, S16, V2S16
509 };
510
511 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512
513 // s1 for VCC branches, s32 for SCC branches.
514 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515
516 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517 // elements for v3s16
518 getActionDefinitionsBuilder(G_PHI)
519 .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520 .legalFor(AllS32Vectors)
521 .legalFor(AllS64Vectors)
522 .legalFor(AddrSpaces64)
523 .legalFor(AddrSpaces32)
524 .legalIf(isPointer(0))
525 .clampScalar(0, S16, S256)
526 .widenScalarToNextPow2(0, 32)
527 .clampMaxNumElements(0, S32, 16)
528 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529 .scalarize(0);
530
531 if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532 // Full set of gfx9 features.
533 getActionDefinitionsBuilder({G_ADD, G_SUB})
534 .legalFor({S32, S16, V2S16})
535 .clampMaxNumElementsStrict(0, S16, 2)
536 .scalarize(0)
537 .minScalar(0, S16)
538 .widenScalarToNextMultipleOf(0, 32)
539 .maxScalar(0, S32);
540
541 getActionDefinitionsBuilder(G_MUL)
542 .legalFor({S32, S16, V2S16})
543 .clampMaxNumElementsStrict(0, S16, 2)
544 .scalarize(0)
545 .minScalar(0, S16)
546 .widenScalarToNextMultipleOf(0, 32)
547 .custom();
548 assert(ST.hasMad64_32());
549
550 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551 .legalFor({S32, S16, V2S16}) // Clamp modifier
552 .minScalarOrElt(0, S16)
553 .clampMaxNumElementsStrict(0, S16, 2)
554 .scalarize(0)
555 .widenScalarToNextPow2(0, 32)
556 .lower();
557 } else if (ST.has16BitInsts()) {
558 getActionDefinitionsBuilder({G_ADD, G_SUB})
559 .legalFor({S32, S16})
560 .minScalar(0, S16)
561 .widenScalarToNextMultipleOf(0, 32)
562 .maxScalar(0, S32)
563 .scalarize(0);
564
565 getActionDefinitionsBuilder(G_MUL)
566 .legalFor({S32, S16})
567 .scalarize(0)
568 .minScalar(0, S16)
569 .widenScalarToNextMultipleOf(0, 32)
570 .custom();
571 assert(ST.hasMad64_32());
572
573 // Technically the saturating operations require clamp bit support, but this
574 // was introduced at the same time as 16-bit operations.
575 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576 .legalFor({S32, S16}) // Clamp modifier
577 .minScalar(0, S16)
578 .scalarize(0)
579 .widenScalarToNextPow2(0, 16)
580 .lower();
581
582 // We're just lowering this, but it helps get a better result to try to
583 // coerce to the desired type first.
584 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585 .minScalar(0, S16)
586 .scalarize(0)
587 .lower();
588 } else {
589 getActionDefinitionsBuilder({G_ADD, G_SUB})
590 .legalFor({S32})
591 .widenScalarToNextMultipleOf(0, 32)
592 .clampScalar(0, S32, S32)
593 .scalarize(0);
594
595 auto &Mul = getActionDefinitionsBuilder(G_MUL)
596 .legalFor({S32})
597 .scalarize(0)
598 .minScalar(0, S32)
599 .widenScalarToNextMultipleOf(0, 32);
600
601 if (ST.hasMad64_32())
602 Mul.custom();
603 else
604 Mul.maxScalar(0, S32);
605
606 if (ST.hasIntClamp()) {
607 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608 .legalFor({S32}) // Clamp modifier.
609 .scalarize(0)
610 .minScalarOrElt(0, S32)
611 .lower();
612 } else {
613 // Clamp bit support was added in VI, along with 16-bit operations.
614 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615 .minScalar(0, S32)
616 .scalarize(0)
617 .lower();
618 }
619
620 // FIXME: DAG expansion gets better results. The widening uses the smaller
621 // range values and goes for the min/max lowering directly.
622 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623 .minScalar(0, S32)
624 .scalarize(0)
625 .lower();
626 }
627
628 getActionDefinitionsBuilder(
629 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630 .customFor({S32, S64})
631 .clampScalar(0, S32, S64)
632 .widenScalarToNextPow2(0, 32)
633 .scalarize(0);
634
635 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636 .legalFor({S32})
637 .maxScalar(0, S32);
638
639 if (ST.hasVOP3PInsts()) {
640 Mulh
641 .clampMaxNumElements(0, S8, 2)
642 .lowerFor({V2S8});
643 }
644
645 Mulh
646 .scalarize(0)
647 .lower();
648
649 // Report legal for any types we can handle anywhere. For the cases only legal
650 // on the SALU, RegBankSelect will be able to re-legalize.
651 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653 .clampScalar(0, S32, S64)
654 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655 .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656 .widenScalarToNextPow2(0)
657 .scalarize(0);
658
659 getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660 G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661 .legalFor({{S32, S1}, {S32, S32}})
662 .minScalar(0, S32)
663 .scalarize(0)
664 .lower();
665
666 getActionDefinitionsBuilder(G_BITCAST)
667 // Don't worry about the size constraint.
668 .legalIf(all(isRegisterType(0), isRegisterType(1)))
669 .lower();
670
671
672 getActionDefinitionsBuilder(G_CONSTANT)
673 .legalFor({S1, S32, S64, S16, GlobalPtr,
674 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675 .legalIf(isPointer(0))
676 .clampScalar(0, S32, S64)
677 .widenScalarToNextPow2(0);
678
679 getActionDefinitionsBuilder(G_FCONSTANT)
680 .legalFor({S32, S64, S16})
681 .clampScalar(0, S16, S64);
682
683 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684 .legalIf(isRegisterType(0))
685 // s1 and s16 are special cases because they have legal operations on
686 // them, but don't really occupy registers in the normal way.
687 .legalFor({S1, S16})
688 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689 .clampScalarOrElt(0, S32, MaxScalar)
690 .widenScalarToNextPow2(0, 32)
691 .clampMaxNumElements(0, S32, 16);
692
693 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694
695 // If the amount is divergent, we have to do a wave reduction to get the
696 // maximum value, so this is expanded during RegBankSelect.
697 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698 .legalFor({{PrivatePtr, S32}});
699
700 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701 .customIf(typeIsNot(0, PrivatePtr));
702
703 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704
705 auto &FPOpActions = getActionDefinitionsBuilder(
706 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707 .legalFor({S32, S64});
708 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709 .customFor({S32, S64});
710 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711 .customFor({S32, S64});
712
713 if (ST.has16BitInsts()) {
714 if (ST.hasVOP3PInsts())
715 FPOpActions.legalFor({S16, V2S16});
716 else
717 FPOpActions.legalFor({S16});
718
719 TrigActions.customFor({S16});
720 FDIVActions.customFor({S16});
721 }
722
723 auto &MinNumMaxNum = getActionDefinitionsBuilder({
724 G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725
726 if (ST.hasVOP3PInsts()) {
727 MinNumMaxNum.customFor(FPTypesPK16)
728 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729 .clampMaxNumElements(0, S16, 2)
730 .clampScalar(0, S16, S64)
731 .scalarize(0);
732 } else if (ST.has16BitInsts()) {
733 MinNumMaxNum.customFor(FPTypes16)
734 .clampScalar(0, S16, S64)
735 .scalarize(0);
736 } else {
737 MinNumMaxNum.customFor(FPTypesBase)
738 .clampScalar(0, S32, S64)
739 .scalarize(0);
740 }
741
742 if (ST.hasVOP3PInsts())
743 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744
745 FPOpActions
746 .scalarize(0)
747 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748
749 TrigActions
750 .scalarize(0)
751 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752
753 FDIVActions
754 .scalarize(0)
755 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756
757 getActionDefinitionsBuilder({G_FNEG, G_FABS})
758 .legalFor(FPTypesPK16)
759 .clampMaxNumElementsStrict(0, S16, 2)
760 .scalarize(0)
761 .clampScalar(0, S16, S64);
762
763 if (ST.has16BitInsts()) {
764 getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765 .legalFor({S32, S64, S16})
766 .scalarize(0)
767 .clampScalar(0, S16, S64);
768 } else {
769 getActionDefinitionsBuilder(G_FSQRT)
770 .legalFor({S32, S64})
771 .scalarize(0)
772 .clampScalar(0, S32, S64);
773
774 if (ST.hasFractBug()) {
775 getActionDefinitionsBuilder(G_FFLOOR)
776 .customFor({S64})
777 .legalFor({S32, S64})
778 .scalarize(0)
779 .clampScalar(0, S32, S64);
780 } else {
781 getActionDefinitionsBuilder(G_FFLOOR)
782 .legalFor({S32, S64})
783 .scalarize(0)
784 .clampScalar(0, S32, S64);
785 }
786 }
787
788 getActionDefinitionsBuilder(G_FPTRUNC)
789 .legalFor({{S32, S64}, {S16, S32}})
790 .scalarize(0)
791 .lower();
792
793 getActionDefinitionsBuilder(G_FPEXT)
794 .legalFor({{S64, S32}, {S32, S16}})
795 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796 .scalarize(0);
797
798 auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
799 if (ST.has16BitInsts()) {
800 FSubActions
801 // Use actual fsub instruction
802 .legalFor({S32, S16})
803 // Must use fadd + fneg
804 .lowerFor({S64, V2S16});
805 } else {
806 FSubActions
807 // Use actual fsub instruction
808 .legalFor({S32})
809 // Must use fadd + fneg
810 .lowerFor({S64, S16, V2S16});
811 }
812
813 FSubActions
814 .scalarize(0)
815 .clampScalar(0, S32, S64);
816
817 // Whether this is legal depends on the floating point mode for the function.
818 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
819 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820 FMad.customFor({S32, S16});
821 else if (ST.hasMadMacF32Insts())
822 FMad.customFor({S32});
823 else if (ST.hasMadF16())
824 FMad.customFor({S16});
825 FMad.scalarize(0)
826 .lower();
827
828 auto &FRem = getActionDefinitionsBuilder(G_FREM);
829 if (ST.has16BitInsts()) {
830 FRem.customFor({S16, S32, S64});
831 } else {
832 FRem.minScalar(0, S32)
833 .customFor({S32, S64});
834 }
835 FRem.scalarize(0);
836
837 // TODO: Do we need to clamp maximum bitwidth?
838 getActionDefinitionsBuilder(G_TRUNC)
839 .legalIf(isScalar(0))
840 .legalFor({{V2S16, V2S32}})
841 .clampMaxNumElements(0, S16, 2)
842 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
843 // situations (like an invalid implicit use), we don't want to loop forever
844 // in the legalizer.
845 .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
846 .alwaysLegal();
847
848 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
849 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
850 {S32, S1}, {S64, S1}, {S16, S1}})
851 .scalarize(0)
852 .clampScalar(0, S32, S64)
853 .widenScalarToNextPow2(1, 32);
854
855 // TODO: Split s1->s64 during regbankselect for VALU.
856 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858 .lowerIf(typeIs(1, S1))
859 .customFor({{S32, S64}, {S64, S64}});
860 if (ST.has16BitInsts())
861 IToFP.legalFor({{S16, S16}});
862 IToFP.clampScalar(1, S32, S64)
863 .minScalar(0, S32)
864 .scalarize(0)
865 .widenScalarToNextPow2(1);
866
867 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
868 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869 .customFor({{S64, S32}, {S64, S64}})
870 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
871 if (ST.has16BitInsts())
872 FPToI.legalFor({{S16, S16}});
873 else
874 FPToI.minScalar(1, S32);
875
876 FPToI.minScalar(0, S32)
877 .widenScalarToNextPow2(0, 32)
878 .scalarize(0)
879 .lower();
880
881 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
882 .customFor({S16, S32})
883 .scalarize(0)
884 .lower();
885
886 // Lower roundeven into G_FRINT
887 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888 .scalarize(0)
889 .lower();
890
891 if (ST.has16BitInsts()) {
892 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893 .legalFor({S16, S32, S64})
894 .clampScalar(0, S16, S64)
895 .scalarize(0);
896 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
897 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
898 .legalFor({S32, S64})
899 .clampScalar(0, S32, S64)
900 .scalarize(0);
901 } else {
902 getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
903 .legalFor({S32})
904 .customFor({S64})
905 .clampScalar(0, S32, S64)
906 .scalarize(0);
907 }
908
909 getActionDefinitionsBuilder(G_PTR_ADD)
910 .legalIf(all(isPointer(0), sameSize(0, 1)))
911 .scalarize(0)
912 .scalarSameSizeAs(1, 0);
913
914 getActionDefinitionsBuilder(G_PTRMASK)
915 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916 .scalarSameSizeAs(1, 0)
917 .scalarize(0);
918
919 auto &CmpBuilder =
920 getActionDefinitionsBuilder(G_ICMP)
921 // The compare output type differs based on the register bank of the output,
922 // so make both s1 and s32 legal.
923 //
924 // Scalar compares producing output in scc will be promoted to s32, as that
925 // is the allocatable register type that will be needed for the copy from
926 // scc. This will be promoted during RegBankSelect, and we assume something
927 // before that won't try to use s32 result types.
928 //
929 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930 // bank.
931 .legalForCartesianProduct(
932 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933 .legalForCartesianProduct(
934 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
935 if (ST.has16BitInsts()) {
936 CmpBuilder.legalFor({{S1, S16}});
937 }
938
939 CmpBuilder
940 .widenScalarToNextPow2(1)
941 .clampScalar(1, S32, S64)
942 .scalarize(0)
943 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
944
945 getActionDefinitionsBuilder(G_FCMP)
946 .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
947 .widenScalarToNextPow2(1)
948 .clampScalar(1, S32, S64)
949 .scalarize(0);
950
951 // FIXME: fpow has a selection pattern that should move to custom lowering.
952 auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
953 if (ST.has16BitInsts())
954 Exp2Ops.legalFor({S32, S16});
955 else
956 Exp2Ops.legalFor({S32});
957 Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
958 Exp2Ops.scalarize(0);
959
960 auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961 if (ST.has16BitInsts())
962 ExpOps.customFor({{S32}, {S16}});
963 else
964 ExpOps.customFor({S32});
965 ExpOps.clampScalar(0, MinScalarFPTy, S32)
966 .scalarize(0);
967
968 getActionDefinitionsBuilder(G_FPOWI)
969 .clampScalar(0, MinScalarFPTy, S32)
970 .lower();
971
972 // The 64-bit versions produce 32-bit results, but only on the SALU.
973 getActionDefinitionsBuilder(G_CTPOP)
974 .legalFor({{S32, S32}, {S32, S64}})
975 .clampScalar(0, S32, S32)
976 .widenScalarToNextPow2(1, 32)
977 .clampScalar(1, S32, S64)
978 .scalarize(0)
979 .widenScalarToNextPow2(0, 32);
980
981
982 // The hardware instructions return a different result on 0 than the generic
983 // instructions expect. The hardware produces -1, but these produce the
984 // bitwidth.
985 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
986 .scalarize(0)
987 .clampScalar(0, S32, S32)
988 .clampScalar(1, S32, S64)
989 .widenScalarToNextPow2(0, 32)
990 .widenScalarToNextPow2(1, 32)
991 .custom();
992
993 // The 64-bit versions produce 32-bit results, but only on the SALU.
994 getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995 .legalFor({{S32, S32}, {S32, S64}})
996 .clampScalar(0, S32, S32)
997 .clampScalar(1, S32, S64)
998 .scalarize(0)
999 .widenScalarToNextPow2(0, 32)
1000 .widenScalarToNextPow2(1, 32);
1001
1002 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003 // RegBankSelect.
1004 getActionDefinitionsBuilder(G_BITREVERSE)
1005 .legalFor({S32, S64})
1006 .clampScalar(0, S32, S64)
1007 .scalarize(0)
1008 .widenScalarToNextPow2(0);
1009
1010 if (ST.has16BitInsts()) {
1011 getActionDefinitionsBuilder(G_BSWAP)
1012 .legalFor({S16, S32, V2S16})
1013 .clampMaxNumElementsStrict(0, S16, 2)
1014 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1015 // narrowScalar limitation.
1016 .widenScalarToNextPow2(0)
1017 .clampScalar(0, S16, S32)
1018 .scalarize(0);
1019
1020 if (ST.hasVOP3PInsts()) {
1021 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022 .legalFor({S32, S16, V2S16})
1023 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1024 .clampMaxNumElements(0, S16, 2)
1025 .minScalar(0, S16)
1026 .widenScalarToNextPow2(0)
1027 .scalarize(0)
1028 .lower();
1029 } else {
1030 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031 .legalFor({S32, S16})
1032 .widenScalarToNextPow2(0)
1033 .minScalar(0, S16)
1034 .scalarize(0)
1035 .lower();
1036 }
1037 } else {
1038 // TODO: Should have same legality without v_perm_b32
1039 getActionDefinitionsBuilder(G_BSWAP)
1040 .legalFor({S32})
1041 .lowerIf(scalarNarrowerThan(0, 32))
1042 // FIXME: Fixing non-power-of-2 before clamp is a workaround for
1043 // narrowScalar limitation.
1044 .widenScalarToNextPow2(0)
1045 .maxScalar(0, S32)
1046 .scalarize(0)
1047 .lower();
1048
1049 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1050 .legalFor({S32})
1051 .minScalar(0, S32)
1052 .widenScalarToNextPow2(0)
1053 .scalarize(0)
1054 .lower();
1055 }
1056
1057 getActionDefinitionsBuilder(G_INTTOPTR)
1058 // List the common cases
1059 .legalForCartesianProduct(AddrSpaces64, {S64})
1060 .legalForCartesianProduct(AddrSpaces32, {S32})
1061 .scalarize(0)
1062 // Accept any address space as long as the size matches
1063 .legalIf(sameSize(0, 1))
1064 .widenScalarIf(smallerThan(1, 0),
1065 [](const LegalityQuery &Query) {
1066 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1067 })
1068 .narrowScalarIf(largerThan(1, 0),
1069 [](const LegalityQuery &Query) {
1070 return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1071 });
1072
1073 getActionDefinitionsBuilder(G_PTRTOINT)
1074 // List the common cases
1075 .legalForCartesianProduct(AddrSpaces64, {S64})
1076 .legalForCartesianProduct(AddrSpaces32, {S32})
1077 .scalarize(0)
1078 // Accept any address space as long as the size matches
1079 .legalIf(sameSize(0, 1))
1080 .widenScalarIf(smallerThan(0, 1),
1081 [](const LegalityQuery &Query) {
1082 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1083 })
1084 .narrowScalarIf(
1085 largerThan(0, 1),
1086 [](const LegalityQuery &Query) {
1087 return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1088 });
1089
1090 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1091 .scalarize(0)
1092 .custom();
1093
1094 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1095 bool IsLoad) -> bool {
1096 const LLT DstTy = Query.Types[0];
1097
1098 // Split vector extloads.
1099 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100
1101 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1102 return true;
1103
1104 const LLT PtrTy = Query.Types[1];
1105 unsigned AS = PtrTy.getAddressSpace();
1106 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1107 return true;
1108
1109 // Catch weird sized loads that don't evenly divide into the access sizes
1110 // TODO: May be able to widen depending on alignment etc.
1111 unsigned NumRegs = (MemSize + 31) / 32;
1112 if (NumRegs == 3) {
1113 if (!ST.hasDwordx3LoadStores())
1114 return true;
1115 } else {
1116 // If the alignment allows, these should have been widened.
1117 if (!isPowerOf2_32(NumRegs))
1118 return true;
1119 }
1120
1121 return false;
1122 };
1123
1124 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1125 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1126 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1127
1128 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1129 // LDS
1130 // TODO: Unsupported flat for SI.
1131
1132 for (unsigned Op : {G_LOAD, G_STORE}) {
1133 const bool IsStore = Op == G_STORE;
1134
1135 auto &Actions = getActionDefinitionsBuilder(Op);
1136 // Explicitly list some common cases.
1137 // TODO: Does this help compile time at all?
1138 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1139 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1140 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1141 {S64, GlobalPtr, S64, GlobalAlign32},
1142 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1143 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1144 {S32, GlobalPtr, S8, GlobalAlign8},
1145 {S32, GlobalPtr, S16, GlobalAlign16},
1146
1147 {S32, LocalPtr, S32, 32},
1148 {S64, LocalPtr, S64, 32},
1149 {V2S32, LocalPtr, V2S32, 32},
1150 {S32, LocalPtr, S8, 8},
1151 {S32, LocalPtr, S16, 16},
1152 {V2S16, LocalPtr, S32, 32},
1153
1154 {S32, PrivatePtr, S32, 32},
1155 {S32, PrivatePtr, S8, 8},
1156 {S32, PrivatePtr, S16, 16},
1157 {V2S16, PrivatePtr, S32, 32},
1158
1159 {S32, ConstantPtr, S32, GlobalAlign32},
1160 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1161 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1162 {S64, ConstantPtr, S64, GlobalAlign32},
1163 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1164 Actions.legalIf(
1165 [=](const LegalityQuery &Query) -> bool {
1166 return isLoadStoreLegal(ST, Query);
1167 });
1168
1169 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1170 // 64-bits.
1171 //
1172 // TODO: Should generalize bitcast action into coerce, which will also cover
1173 // inserting addrspacecasts.
1174 Actions.customIf(typeIs(1, Constant32Ptr));
1175
1176 // Turn any illegal element vectors into something easier to deal
1177 // with. These will ultimately produce 32-bit scalar shifts to extract the
1178 // parts anyway.
1179 //
1180 // For odd 16-bit element vectors, prefer to split those into pieces with
1181 // 16-bit vector parts.
1182 Actions.bitcastIf(
1183 [=](const LegalityQuery &Query) -> bool {
1184 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1185 Query.MMODescrs[0].MemoryTy);
1186 }, bitcastToRegisterType(0));
1187
1188 if (!IsStore) {
1189 // Widen suitably aligned loads by loading extra bytes. The standard
1190 // legalization actions can't properly express widening memory operands.
1191 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1192 return shouldWidenLoad(ST, Query, G_LOAD);
1193 });
1194 }
1195
1196 // FIXME: load/store narrowing should be moved to lower action
1197 Actions
1198 .narrowScalarIf(
1199 [=](const LegalityQuery &Query) -> bool {
1200 return !Query.Types[0].isVector() &&
1201 needToSplitMemOp(Query, Op == G_LOAD);
1202 },
1203 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1204 const LLT DstTy = Query.Types[0];
1205 const LLT PtrTy = Query.Types[1];
1206
1207 const unsigned DstSize = DstTy.getSizeInBits();
1208 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1209
1210 // Split extloads.
1211 if (DstSize > MemSize)
1212 return std::make_pair(0, LLT::scalar(MemSize));
1213
1214 unsigned MaxSize = maxSizeForAddrSpace(ST,
1215 PtrTy.getAddressSpace(),
1216 Op == G_LOAD);
1217 if (MemSize > MaxSize)
1218 return std::make_pair(0, LLT::scalar(MaxSize));
1219
1220 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1221 return std::make_pair(0, LLT::scalar(Align));
1222 })
1223 .fewerElementsIf(
1224 [=](const LegalityQuery &Query) -> bool {
1225 return Query.Types[0].isVector() &&
1226 needToSplitMemOp(Query, Op == G_LOAD);
1227 },
1228 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1229 const LLT DstTy = Query.Types[0];
1230 const LLT PtrTy = Query.Types[1];
1231
1232 LLT EltTy = DstTy.getElementType();
1233 unsigned MaxSize = maxSizeForAddrSpace(ST,
1234 PtrTy.getAddressSpace(),
1235 Op == G_LOAD);
1236
1237 // FIXME: Handle widened to power of 2 results better. This ends
1238 // up scalarizing.
1239 // FIXME: 3 element stores scalarized on SI
1240
1241 // Split if it's too large for the address space.
1242 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1243 if (MemSize > MaxSize) {
1244 unsigned NumElts = DstTy.getNumElements();
1245 unsigned EltSize = EltTy.getSizeInBits();
1246
1247 if (MaxSize % EltSize == 0) {
1248 return std::make_pair(
1249 0, LLT::scalarOrVector(
1250 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1251 }
1252
1253 unsigned NumPieces = MemSize / MaxSize;
1254
1255 // FIXME: Refine when odd breakdowns handled
1256 // The scalars will need to be re-legalized.
1257 if (NumPieces == 1 || NumPieces >= NumElts ||
1258 NumElts % NumPieces != 0)
1259 return std::make_pair(0, EltTy);
1260
1261 return std::make_pair(
1262 0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1263 }
1264
1265 // FIXME: We could probably handle weird extending loads better.
1266 if (DstTy.getSizeInBits() > MemSize)
1267 return std::make_pair(0, EltTy);
1268
1269 unsigned EltSize = EltTy.getSizeInBits();
1270 unsigned DstSize = DstTy.getSizeInBits();
1271 if (!isPowerOf2_32(DstSize)) {
1272 // We're probably decomposing an odd sized store. Try to split
1273 // to the widest type. TODO: Account for alignment. As-is it
1274 // should be OK, since the new parts will be further legalized.
1275 unsigned FloorSize = PowerOf2Floor(DstSize);
1276 return std::make_pair(
1277 0, LLT::scalarOrVector(
1278 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1279 }
1280
1281 // May need relegalization for the scalars.
1282 return std::make_pair(0, EltTy);
1283 })
1284 .minScalar(0, S32)
1285 .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1286 .widenScalarToNextPow2(0)
1287 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1288 .lower();
1289 }
1290
1291 // FIXME: Unaligned accesses not lowered.
1292 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1293 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1294 {S32, GlobalPtr, S16, 2 * 8},
1295 {S32, LocalPtr, S8, 8},
1296 {S32, LocalPtr, S16, 16},
1297 {S32, PrivatePtr, S8, 8},
1298 {S32, PrivatePtr, S16, 16},
1299 {S32, ConstantPtr, S8, 8},
1300 {S32, ConstantPtr, S16, 2 * 8}})
1301 .legalIf(
1302 [=](const LegalityQuery &Query) -> bool {
1303 return isLoadStoreLegal(ST, Query);
1304 });
1305
1306 if (ST.hasFlatAddressSpace()) {
1307 ExtLoads.legalForTypesWithMemDesc(
1308 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1309 }
1310
1311 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1312 // 64-bits.
1313 //
1314 // TODO: Should generalize bitcast action into coerce, which will also cover
1315 // inserting addrspacecasts.
1316 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1317
1318 ExtLoads.clampScalar(0, S32, S32)
1319 .widenScalarToNextPow2(0)
1320 .lower();
1321
1322 auto &Atomics = getActionDefinitionsBuilder(
1323 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1324 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1325 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1326 G_ATOMICRMW_UMIN})
1327 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1328 {S64, GlobalPtr}, {S64, LocalPtr},
1329 {S32, RegionPtr}, {S64, RegionPtr}});
1330 if (ST.hasFlatAddressSpace()) {
1331 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1332 }
1333
1334 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1335 if (ST.hasLDSFPAtomicAdd()) {
1336 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1337 if (ST.hasGFX90AInsts())
1338 Atomic.legalFor({{S64, LocalPtr}});
1339 if (ST.hasGFX940Insts())
1340 Atomic.legalFor({{V2S16, LocalPtr}});
1341 }
1342 if (ST.hasAtomicFaddInsts())
1343 Atomic.legalFor({{S32, GlobalPtr}});
1344
1345 if (ST.hasGFX90AInsts()) {
1346 // These are legal with some caveats, and should have undergone expansion in
1347 // the IR in most situations
1348 // TODO: Move atomic expansion into legalizer
1349 // TODO: Also supports <2 x f16>
1350 Atomic.legalFor({
1351 {S32, GlobalPtr},
1352 {S64, GlobalPtr},
1353 {S64, FlatPtr}
1354 });
1355 }
1356
1357 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1358 // demarshalling.
1359 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1360 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1361 {S32, FlatPtr}, {S64, FlatPtr}})
1362 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1363 {S32, RegionPtr}, {S64, RegionPtr}});
1364 // TODO: Pointer types, any 32-bit or 64-bit vector
1365
1366 // Condition should be s32 for scalar, s1 for vector.
1367 getActionDefinitionsBuilder(G_SELECT)
1368 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1369 LocalPtr, FlatPtr, PrivatePtr,
1370 LLT::fixed_vector(2, LocalPtr),
1371 LLT::fixed_vector(2, PrivatePtr)},
1372 {S1, S32})
1373 .clampScalar(0, S16, S64)
1374 .scalarize(1)
1375 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1376 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1377 .clampMaxNumElements(0, S32, 2)
1378 .clampMaxNumElements(0, LocalPtr, 2)
1379 .clampMaxNumElements(0, PrivatePtr, 2)
1380 .scalarize(0)
1381 .widenScalarToNextPow2(0)
1382 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1383
1384 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1385 // be more flexible with the shift amount type.
1386 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1387 .legalFor({{S32, S32}, {S64, S32}});
1388 if (ST.has16BitInsts()) {
1389 if (ST.hasVOP3PInsts()) {
1390 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1391 .clampMaxNumElements(0, S16, 2);
1392 } else
1393 Shifts.legalFor({{S16, S16}});
1394
1395 // TODO: Support 16-bit shift amounts for all types
1396 Shifts.widenScalarIf(
1397 [=](const LegalityQuery &Query) {
1398 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1399 // 32-bit amount.
1400 const LLT ValTy = Query.Types[0];
1401 const LLT AmountTy = Query.Types[1];
1402 return ValTy.getSizeInBits() <= 16 &&
1403 AmountTy.getSizeInBits() < 16;
1404 }, changeTo(1, S16));
1405 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1406 Shifts.clampScalar(1, S32, S32);
1407 Shifts.widenScalarToNextPow2(0, 16);
1408 Shifts.clampScalar(0, S16, S64);
1409
1410 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1411 .minScalar(0, S16)
1412 .scalarize(0)
1413 .lower();
1414 } else {
1415 // Make sure we legalize the shift amount type first, as the general
1416 // expansion for the shifted type will produce much worse code if it hasn't
1417 // been truncated already.
1418 Shifts.clampScalar(1, S32, S32);
1419 Shifts.widenScalarToNextPow2(0, 32);
1420 Shifts.clampScalar(0, S32, S64);
1421
1422 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1423 .minScalar(0, S32)
1424 .scalarize(0)
1425 .lower();
1426 }
1427 Shifts.scalarize(0);
1428
1429 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1430 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1431 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1432 unsigned IdxTypeIdx = 2;
1433
1434 getActionDefinitionsBuilder(Op)
1435 .customIf([=](const LegalityQuery &Query) {
1436 const LLT EltTy = Query.Types[EltTypeIdx];
1437 const LLT VecTy = Query.Types[VecTypeIdx];
1438 const LLT IdxTy = Query.Types[IdxTypeIdx];
1439 const unsigned EltSize = EltTy.getSizeInBits();
1440 return (EltSize == 32 || EltSize == 64) &&
1441 VecTy.getSizeInBits() % 32 == 0 &&
1442 VecTy.getSizeInBits() <= MaxRegisterSize &&
1443 IdxTy.getSizeInBits() == 32;
1444 })
1445 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1446 bitcastToVectorElement32(VecTypeIdx))
1447 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1448 .bitcastIf(
1449 all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1450 [=](const LegalityQuery &Query) {
1451 // For > 64-bit element types, try to turn this into a 64-bit
1452 // element vector since we may be able to do better indexing
1453 // if this is scalar. If not, fall back to 32.
1454 const LLT EltTy = Query.Types[EltTypeIdx];
1455 const LLT VecTy = Query.Types[VecTypeIdx];
1456 const unsigned DstEltSize = EltTy.getSizeInBits();
1457 const unsigned VecSize = VecTy.getSizeInBits();
1458
1459 const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1460 return std::make_pair(
1461 VecTypeIdx,
1462 LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1463 })
1464 .clampScalar(EltTypeIdx, S32, S64)
1465 .clampScalar(VecTypeIdx, S32, S64)
1466 .clampScalar(IdxTypeIdx, S32, S32)
1467 .clampMaxNumElements(VecTypeIdx, S32, 32)
1468 // TODO: Clamp elements for 64-bit vectors?
1469 // It should only be necessary with variable indexes.
1470 // As a last resort, lower to the stack
1471 .lower();
1472 }
1473
1474 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1475 .unsupportedIf([=](const LegalityQuery &Query) {
1476 const LLT &EltTy = Query.Types[1].getElementType();
1477 return Query.Types[0] != EltTy;
1478 });
1479
1480 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1481 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1482 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1483
1484 // FIXME: Doesn't handle extract of illegal sizes.
1485 getActionDefinitionsBuilder(Op)
1486 .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1487 .lowerIf([=](const LegalityQuery &Query) {
1488 // Sub-vector (or single element) insert and extract.
1489 // TODO: verify immediate offset here since lower only works with
1490 // whole elements.
1491 const LLT BigTy = Query.Types[BigTyIdx];
1492 return BigTy.isVector();
1493 })
1494 // FIXME: Multiples of 16 should not be legal.
1495 .legalIf([=](const LegalityQuery &Query) {
1496 const LLT BigTy = Query.Types[BigTyIdx];
1497 const LLT LitTy = Query.Types[LitTyIdx];
1498 return (BigTy.getSizeInBits() % 32 == 0) &&
1499 (LitTy.getSizeInBits() % 16 == 0);
1500 })
1501 .widenScalarIf(
1502 [=](const LegalityQuery &Query) {
1503 const LLT BigTy = Query.Types[BigTyIdx];
1504 return (BigTy.getScalarSizeInBits() < 16);
1505 },
1506 LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1507 .widenScalarIf(
1508 [=](const LegalityQuery &Query) {
1509 const LLT LitTy = Query.Types[LitTyIdx];
1510 return (LitTy.getScalarSizeInBits() < 16);
1511 },
1512 LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1513 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1514 .widenScalarToNextPow2(BigTyIdx, 32);
1515
1516 }
1517
1518 auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1519 .legalForCartesianProduct(AllS32Vectors, {S32})
1520 .legalForCartesianProduct(AllS64Vectors, {S64})
1521 .clampNumElements(0, V16S32, V32S32)
1522 .clampNumElements(0, V2S64, V16S64)
1523 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1524
1525 if (ST.hasScalarPackInsts()) {
1526 BuildVector
1527 // FIXME: Should probably widen s1 vectors straight to s32
1528 .minScalarOrElt(0, S16)
1529 // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1530 .minScalar(1, S32);
1531
1532 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1533 .legalFor({V2S16, S32})
1534 .lower();
1535 BuildVector.minScalarOrElt(0, S32);
1536 } else {
1537 BuildVector.customFor({V2S16, S16});
1538 BuildVector.minScalarOrElt(0, S32);
1539
1540 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1541 .customFor({V2S16, S32})
1542 .lower();
1543 }
1544
1545 BuildVector.legalIf(isRegisterType(0));
1546
1547 // FIXME: Clamp maximum size
1548 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1549 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1550 .clampMaxNumElements(0, S32, 32)
1551 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1552 .clampMaxNumElements(0, S16, 64);
1553
1554 // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1555 // pre-legalize.
1556 if (ST.hasVOP3PInsts()) {
1557 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1558 .customFor({V2S16, V2S16})
1559 .lower();
1560 } else
1561 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1562
1563 // Merge/Unmerge
1564 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1565 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1566 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1567
1568 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1569 const LLT Ty = Query.Types[TypeIdx];
1570 if (Ty.isVector()) {
1571 const LLT &EltTy = Ty.getElementType();
1572 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1573 return true;
1574 if (!isPowerOf2_32(EltTy.getSizeInBits()))
1575 return true;
1576 }
1577 return false;
1578 };
1579
1580 auto &Builder = getActionDefinitionsBuilder(Op)
1581 .legalIf(all(isRegisterType(0), isRegisterType(1)))
1582 .lowerFor({{S16, V2S16}})
1583 .lowerIf([=](const LegalityQuery &Query) {
1584 const LLT BigTy = Query.Types[BigTyIdx];
1585 return BigTy.getSizeInBits() == 32;
1586 })
1587 // Try to widen to s16 first for small types.
1588 // TODO: Only do this on targets with legal s16 shifts
1589 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1590 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1591 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1592 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1593 elementTypeIs(1, S16)),
1594 changeTo(1, V2S16))
1595 // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1596 // worth considering the multiples of 64 since 2*192 and 2*384 are not
1597 // valid.
1598 .clampScalar(LitTyIdx, S32, S512)
1599 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1600 // Break up vectors with weird elements into scalars
1601 .fewerElementsIf(
1602 [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1603 scalarize(0))
1604 .fewerElementsIf(
1605 [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1606 scalarize(1))
1607 .clampScalar(BigTyIdx, S32, MaxScalar);
1608
1609 if (Op == G_MERGE_VALUES) {
1610 Builder.widenScalarIf(
1611 // TODO: Use 16-bit shifts if legal for 8-bit values?
1612 [=](const LegalityQuery &Query) {
1613 const LLT Ty = Query.Types[LitTyIdx];
1614 return Ty.getSizeInBits() < 32;
1615 },
1616 changeTo(LitTyIdx, S32));
1617 }
1618
1619 Builder.widenScalarIf(
1620 [=](const LegalityQuery &Query) {
1621 const LLT Ty = Query.Types[BigTyIdx];
1622 return !isPowerOf2_32(Ty.getSizeInBits()) &&
1623 Ty.getSizeInBits() % 16 != 0;
1624 },
1625 [=](const LegalityQuery &Query) {
 1626          // Round up to the next power of 2, or, once at 256 bits or wider,
 1627          // to the next multiple of 64 if that is smaller.
1628 const LLT &Ty = Query.Types[BigTyIdx];
1629 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1630 if (NewSizeInBits >= 256) {
1631 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1632 if (RoundedTo < NewSizeInBits)
1633 NewSizeInBits = RoundedTo;
1634 }
1635 return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1636 })
1637 // Any vectors left are the wrong size. Scalarize them.
1638 .scalarize(0)
1639 .scalarize(1);
1640 }
1641
1642 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1643 // RegBankSelect.
1644 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1645 .legalFor({{S32}, {S64}});
1646
1647 if (ST.hasVOP3PInsts()) {
1648 SextInReg.lowerFor({{V2S16}})
1649 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1650 // get more vector shift opportunities, since we'll get those when
1651 // expanded.
1652 .clampMaxNumElementsStrict(0, S16, 2);
1653 } else if (ST.has16BitInsts()) {
1654 SextInReg.lowerFor({{S32}, {S64}, {S16}});
1655 } else {
1656 // Prefer to promote to s32 before lowering if we don't have 16-bit
 1657     // shifts. This avoids a lot of intermediate truncate and extend operations.
1658 SextInReg.lowerFor({{S32}, {S64}});
1659 }
1660
1661 SextInReg
1662 .scalarize(0)
1663 .clampScalar(0, S32, S64)
1664 .lower();
1665
1666 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1667 .scalarize(0)
1668 .lower();
1669
 1670   // TODO: Only try to form v2s16 with legal packed instructions.
1671 getActionDefinitionsBuilder(G_FSHR)
1672 .legalFor({{S32, S32}})
1673 .lowerFor({{V2S16, V2S16}})
1674 .clampMaxNumElementsStrict(0, S16, 2)
1675 .scalarize(0)
1676 .lower();
1677
1678 if (ST.hasVOP3PInsts()) {
1679 getActionDefinitionsBuilder(G_FSHL)
1680 .lowerFor({{V2S16, V2S16}})
1681 .clampMaxNumElementsStrict(0, S16, 2)
1682 .scalarize(0)
1683 .lower();
1684 } else {
1685 getActionDefinitionsBuilder(G_FSHL)
1686 .scalarize(0)
1687 .lower();
1688 }
1689
1690 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1691 .legalFor({S64});
1692
1693 getActionDefinitionsBuilder(G_FENCE)
1694 .alwaysLegal();
1695
1696 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1697 .scalarize(0)
1698 .minScalar(0, S32)
1699 .lower();
1700
1701 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1702 .legalFor({{S32, S32}, {S64, S32}})
1703 .clampScalar(1, S32, S32)
1704 .clampScalar(0, S32, S64)
1705 .widenScalarToNextPow2(0)
1706 .scalarize(0);
1707
1708 getActionDefinitionsBuilder({
1709 // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1710 G_FCOPYSIGN,
1711
1712 G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1713 G_ATOMICRMW_NAND,
1714 G_ATOMICRMW_FSUB,
1715 G_READ_REGISTER,
1716 G_WRITE_REGISTER,
1717
1718 G_SADDO, G_SSUBO,
1719
1720 // TODO: Implement
1721 G_FMINIMUM, G_FMAXIMUM}).lower();
1722
1723 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1724 .lower();
1725
1726 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1727 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1728 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1729 .unsupported();
1730
1731 getLegacyLegalizerInfo().computeTables();
1732 verify(*ST.getInstrInfo());
1733 }
1734
1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1736 MachineInstr &MI) const {
1737 MachineIRBuilder &B = Helper.MIRBuilder;
1738 MachineRegisterInfo &MRI = *B.getMRI();
1739
1740 switch (MI.getOpcode()) {
1741 case TargetOpcode::G_ADDRSPACE_CAST:
1742 return legalizeAddrSpaceCast(MI, MRI, B);
1743 case TargetOpcode::G_FRINT:
1744 return legalizeFrint(MI, MRI, B);
1745 case TargetOpcode::G_FCEIL:
1746 return legalizeFceil(MI, MRI, B);
1747 case TargetOpcode::G_FREM:
1748 return legalizeFrem(MI, MRI, B);
1749 case TargetOpcode::G_INTRINSIC_TRUNC:
1750 return legalizeIntrinsicTrunc(MI, MRI, B);
1751 case TargetOpcode::G_SITOFP:
1752 return legalizeITOFP(MI, MRI, B, true);
1753 case TargetOpcode::G_UITOFP:
1754 return legalizeITOFP(MI, MRI, B, false);
1755 case TargetOpcode::G_FPTOSI:
1756 return legalizeFPTOI(MI, MRI, B, true);
1757 case TargetOpcode::G_FPTOUI:
1758 return legalizeFPTOI(MI, MRI, B, false);
1759 case TargetOpcode::G_FMINNUM:
1760 case TargetOpcode::G_FMAXNUM:
1761 case TargetOpcode::G_FMINNUM_IEEE:
1762 case TargetOpcode::G_FMAXNUM_IEEE:
1763 return legalizeMinNumMaxNum(Helper, MI);
1764 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1765 return legalizeExtractVectorElt(MI, MRI, B);
1766 case TargetOpcode::G_INSERT_VECTOR_ELT:
1767 return legalizeInsertVectorElt(MI, MRI, B);
1768 case TargetOpcode::G_SHUFFLE_VECTOR:
1769 return legalizeShuffleVector(MI, MRI, B);
1770 case TargetOpcode::G_FSIN:
1771 case TargetOpcode::G_FCOS:
1772 return legalizeSinCos(MI, MRI, B);
1773 case TargetOpcode::G_GLOBAL_VALUE:
1774 return legalizeGlobalValue(MI, MRI, B);
1775 case TargetOpcode::G_LOAD:
1776 case TargetOpcode::G_SEXTLOAD:
1777 case TargetOpcode::G_ZEXTLOAD:
1778 return legalizeLoad(Helper, MI);
1779 case TargetOpcode::G_FMAD:
1780 return legalizeFMad(MI, MRI, B);
1781 case TargetOpcode::G_FDIV:
1782 return legalizeFDIV(MI, MRI, B);
1783 case TargetOpcode::G_UDIV:
1784 case TargetOpcode::G_UREM:
1785 case TargetOpcode::G_UDIVREM:
1786 return legalizeUnsignedDIV_REM(MI, MRI, B);
1787 case TargetOpcode::G_SDIV:
1788 case TargetOpcode::G_SREM:
1789 case TargetOpcode::G_SDIVREM:
1790 return legalizeSignedDIV_REM(MI, MRI, B);
1791 case TargetOpcode::G_ATOMIC_CMPXCHG:
1792 return legalizeAtomicCmpXChg(MI, MRI, B);
1793 case TargetOpcode::G_FLOG:
1794 return legalizeFlog(MI, B, numbers::ln2f);
1795 case TargetOpcode::G_FLOG10:
1796 return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1797 case TargetOpcode::G_FEXP:
1798 return legalizeFExp(MI, B);
1799 case TargetOpcode::G_FPOW:
1800 return legalizeFPow(MI, B);
1801 case TargetOpcode::G_FFLOOR:
1802 return legalizeFFloor(MI, MRI, B);
1803 case TargetOpcode::G_BUILD_VECTOR:
1804 return legalizeBuildVector(MI, MRI, B);
1805 case TargetOpcode::G_MUL:
1806 return legalizeMul(Helper, MI);
1807 case TargetOpcode::G_CTLZ:
1808 case TargetOpcode::G_CTTZ:
1809 return legalizeCTLZ_CTTZ(MI, MRI, B);
1810 case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1811 return legalizeFPTruncRound(MI, B);
1812 default:
1813 return false;
1814 }
1815
1816 llvm_unreachable("expected switch to return");
1817 }
1818
1819 Register AMDGPULegalizerInfo::getSegmentAperture(
1820 unsigned AS,
1821 MachineRegisterInfo &MRI,
1822 MachineIRBuilder &B) const {
1823 MachineFunction &MF = B.getMF();
1824 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1825 const LLT S32 = LLT::scalar(32);
1826
1827 assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1828
1829 if (ST.hasApertureRegs()) {
1830 // FIXME: Use inline constants (src_{shared, private}_base) instead of
1831 // getreg.
1832 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1833 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1834 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1835 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1836 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1837 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1838 unsigned Encoding =
1839 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1840 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1841 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1842
1843 Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1844
1845 B.buildInstr(AMDGPU::S_GETREG_B32)
1846 .addDef(GetReg)
1847 .addImm(Encoding);
1848 MRI.setType(GetReg, S32);
1849
1850 auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1851 return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1852 }
1853
1854 // TODO: can we be smarter about machine pointer info?
1855 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1856 Register LoadAddr = MRI.createGenericVirtualRegister(
1857 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1858 // For code object version 5, private_base and shared_base are passed through
1859 // implicit kernargs.
1860 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1861 AMDGPUTargetLowering::ImplicitParameter Param =
1862 AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1863 : AMDGPUTargetLowering::PRIVATE_BASE;
1864 uint64_t Offset =
1865 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1866
1867 Register KernargPtrReg = MRI.createGenericVirtualRegister(
1868 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1869
1870 if (!loadInputValue(KernargPtrReg, B,
1871 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1872 return Register();
1873
1874 MachineMemOperand *MMO = MF.getMachineMemOperand(
1875 PtrInfo,
1876 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1877 MachineMemOperand::MOInvariant,
1878 LLT::scalar(32), commonAlignment(Align(64), Offset));
1879
1880 // Pointer address
1881 B.buildPtrAdd(LoadAddr, KernargPtrReg,
1882 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1883 // Load address
1884 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1885 }
1886
1887 Register QueuePtr = MRI.createGenericVirtualRegister(
1888 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1889
1890 if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1891 return Register();
1892
1893 // Offset into amd_queue_t for group_segment_aperture_base_hi /
1894 // private_segment_aperture_base_hi.
1895 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1896
1897 MachineMemOperand *MMO = MF.getMachineMemOperand(
1898 PtrInfo,
1899 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1900 MachineMemOperand::MOInvariant,
1901 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1902
1903 B.buildPtrAdd(LoadAddr, QueuePtr,
1904 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1905 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1906 }
1907
1908 /// Return true if the value is a known valid address, such that a null check is
1909 /// not necessary.
1910 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1911 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1912 MachineInstr *Def = MRI.getVRegDef(Val);
1913 switch (Def->getOpcode()) {
1914 case AMDGPU::G_FRAME_INDEX:
1915 case AMDGPU::G_GLOBAL_VALUE:
1916 case AMDGPU::G_BLOCK_ADDR:
1917 return true;
1918 case AMDGPU::G_CONSTANT: {
1919 const ConstantInt *CI = Def->getOperand(1).getCImm();
1920 return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1921 }
1922 default:
1923 return false;
1924 }
1925
1926 return false;
1927 }
1928
1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1930 MachineInstr &MI, MachineRegisterInfo &MRI,
1931 MachineIRBuilder &B) const {
1932 MachineFunction &MF = B.getMF();
1933
1934 const LLT S32 = LLT::scalar(32);
1935 Register Dst = MI.getOperand(0).getReg();
1936 Register Src = MI.getOperand(1).getReg();
1937
1938 LLT DstTy = MRI.getType(Dst);
1939 LLT SrcTy = MRI.getType(Src);
1940 unsigned DestAS = DstTy.getAddressSpace();
1941 unsigned SrcAS = SrcTy.getAddressSpace();
1942
1943 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1944 // vector element.
1945 assert(!DstTy.isVector());
1946
1947 const AMDGPUTargetMachine &TM
1948 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1949
1950 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1951 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1952 return true;
1953 }
1954
1955 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1956 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1957 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1958 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1959 // Extract low 32-bits of the pointer.
1960 B.buildExtract(Dst, Src, 0);
1961 MI.eraseFromParent();
1962 return true;
1963 }
1964
1965 unsigned NullVal = TM.getNullPointerValue(DestAS);
1966
1967 auto SegmentNull = B.buildConstant(DstTy, NullVal);
1968 auto FlatNull = B.buildConstant(SrcTy, 0);
1969
1970 // Extract low 32-bits of the pointer.
1971 auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1972
1973 auto CmpRes =
1974 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1975 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1976
1977 MI.eraseFromParent();
1978 return true;
1979 }
1980
1981 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1982 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1983 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1984 if (!ST.hasFlatAddressSpace())
1985 return false;
1986
1987 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1988 if (!ApertureReg.isValid())
1989 return false;
1990
1991 // Coerce the type of the low half of the result so we can use merge_values.
1992 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1993
1994 // TODO: Should we allow mismatched types but matching sizes in merges to
1995 // avoid the ptrtoint?
1996 auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1997
1998 if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1999 B.buildCopy(Dst, BuildPtr);
2000 MI.eraseFromParent();
2001 return true;
2002 }
2003
2004 auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2005 auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2006
2007 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2008 SegmentNull.getReg(0));
2009
2010 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2011
2012 MI.eraseFromParent();
2013 return true;
2014 }
2015
2016 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2017 SrcTy.getSizeInBits() == 64) {
2018 // Truncate.
2019 B.buildExtract(Dst, Src, 0);
2020 MI.eraseFromParent();
2021 return true;
2022 }
2023
2024 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2025 DstTy.getSizeInBits() == 64) {
2026 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2027 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2028
2029     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
2030     // another pointer type. Merge operands are required to be the same type,
2031     // but creating an extra ptrtoint would be kind of pointless.
2032 auto HighAddr = B.buildConstant(
2033 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2034 B.buildMerge(Dst, {Src, HighAddr});
2035 MI.eraseFromParent();
2036 return true;
2037 }
2038
2039 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2040 MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2041
2042 LLVMContext &Ctx = MF.getFunction().getContext();
2043 Ctx.diagnose(InvalidAddrSpaceCast);
2044 B.buildUndef(Dst);
2045 MI.eraseFromParent();
2046 return true;
2047 }
2048
2049 bool AMDGPULegalizerInfo::legalizeFrint(
2050 MachineInstr &MI, MachineRegisterInfo &MRI,
2051 MachineIRBuilder &B) const {
2052 Register Src = MI.getOperand(1).getReg();
2053 LLT Ty = MRI.getType(Src);
2054 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2055
2056 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2057 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
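  // This is the usual add-and-subtract rounding trick: for |x| < 2^52, adding
  // copysign(2^52, x) pushes the fraction bits out of the 53-bit significand,
  // so the current rounding mode rounds x to a whole number, and subtracting
  // the same constant recovers that integer. Values with a magnitude above
  // 0x1.fffffffffffffp+51 are already integral and are passed through by the
  // final select.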
2058
2059 auto C1 = B.buildFConstant(Ty, C1Val);
2060 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2061
2062 // TODO: Should this propagate fast-math-flags?
2063 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2064 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2065
2066 auto C2 = B.buildFConstant(Ty, C2Val);
2067 auto Fabs = B.buildFAbs(Ty, Src);
2068
2069 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2070 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2071 MI.eraseFromParent();
2072 return true;
2073 }
2074
2075 bool AMDGPULegalizerInfo::legalizeFceil(
2076 MachineInstr &MI, MachineRegisterInfo &MRI,
2077 MachineIRBuilder &B) const {
2078
2079 const LLT S1 = LLT::scalar(1);
2080 const LLT S64 = LLT::scalar(64);
2081
2082 Register Src = MI.getOperand(1).getReg();
2083 assert(MRI.getType(Src) == S64);
2084
2085 // result = trunc(src)
2086 // if (src > 0.0 && src != result)
2087 // result += 1.0
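  //
  // For example, ceil(1.5): trunc gives 1.0, and since 1.5 > 0.0 and
  // 1.5 != 1.0 we add 1.0, yielding 2.0. For ceil(-1.5): trunc gives -1.0 and
  // the compare against 0.0 fails, so the result stays -1.0.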
2088
2089 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2090
2091 const auto Zero = B.buildFConstant(S64, 0.0);
2092 const auto One = B.buildFConstant(S64, 1.0);
2093 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2094 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2095 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2096 auto Add = B.buildSelect(S64, And, One, Zero);
2097
2098 // TODO: Should this propagate fast-math-flags?
2099 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2100 MI.eraseFromParent();
2101 return true;
2102 }
2103
2104 bool AMDGPULegalizerInfo::legalizeFrem(
2105 MachineInstr &MI, MachineRegisterInfo &MRI,
2106 MachineIRBuilder &B) const {
2107 Register DstReg = MI.getOperand(0).getReg();
2108 Register Src0Reg = MI.getOperand(1).getReg();
2109 Register Src1Reg = MI.getOperand(2).getReg();
2110 auto Flags = MI.getFlags();
2111 LLT Ty = MRI.getType(DstReg);
2112
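  // frem(x, y) == x - trunc(x / y) * y, expressed here as a single fused
  // multiply-add: fma(-trunc(x / y), y, x).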
2113 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2114 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2115 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2116 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2117 MI.eraseFromParent();
2118 return true;
2119 }
2120
2121 static MachineInstrBuilder extractF64Exponent(Register Hi,
2122 MachineIRBuilder &B) {
2123 const unsigned FractBits = 52;
2124 const unsigned ExpBits = 11;
2125 LLT S32 = LLT::scalar(32);
2126
2127 auto Const0 = B.buildConstant(S32, FractBits - 32);
2128 auto Const1 = B.buildConstant(S32, ExpBits);
2129
2130 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2131 .addUse(Hi)
2132 .addUse(Const0.getReg(0))
2133 .addUse(Const1.getReg(0));
2134
2135 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2136 }
2137
2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2139 MachineInstr &MI, MachineRegisterInfo &MRI,
2140 MachineIRBuilder &B) const {
2141 const LLT S1 = LLT::scalar(1);
2142 const LLT S32 = LLT::scalar(32);
2143 const LLT S64 = LLT::scalar(64);
2144
2145 Register Src = MI.getOperand(1).getReg();
2146 assert(MRI.getType(Src) == S64);
2147
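  // The approach: build a mask of the fraction bits that fall below the
  // value's unbiased exponent and clear them from the source. The extremes are
  // handled by the selects below: an exponent < 0 truncates to +/-0 (just the
  // sign bit), and an exponent > 51 means there is no fractional part at all.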
2148 // TODO: Should this use extract since the low half is unused?
2149 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2150 Register Hi = Unmerge.getReg(1);
2151
2152 // Extract the upper half, since this is where we will find the sign and
2153 // exponent.
2154 auto Exp = extractF64Exponent(Hi, B);
2155
2156 const unsigned FractBits = 52;
2157
2158 // Extract the sign bit.
2159 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2160 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2161
2162 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2163
2164 const auto Zero32 = B.buildConstant(S32, 0);
2165
2166 // Extend back to 64-bits.
2167 auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2168
2169 auto Shr = B.buildAShr(S64, FractMask, Exp);
2170 auto Not = B.buildNot(S64, Shr);
2171 auto Tmp0 = B.buildAnd(S64, Src, Not);
2172 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2173
2174 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2175 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2176
2177 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2178 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2179 MI.eraseFromParent();
2180 return true;
2181 }
2182
2183 bool AMDGPULegalizerInfo::legalizeITOFP(
2184 MachineInstr &MI, MachineRegisterInfo &MRI,
2185 MachineIRBuilder &B, bool Signed) const {
2186
2187 Register Dst = MI.getOperand(0).getReg();
2188 Register Src = MI.getOperand(1).getReg();
2189
2190 const LLT S64 = LLT::scalar(64);
2191 const LLT S32 = LLT::scalar(32);
2192
2193 assert(MRI.getType(Src) == S64);
2194
2195 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2196 auto ThirtyTwo = B.buildConstant(S32, 32);
2197
2198 if (MRI.getType(Dst) == S64) {
2199 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2200 : B.buildUITOFP(S64, Unmerge.getReg(1));
2201
2202 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2203 auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2204 .addUse(CvtHi.getReg(0))
2205 .addUse(ThirtyTwo.getReg(0));
2206
2207 // TODO: Should this propagate fast-math-flags?
2208 B.buildFAdd(Dst, LdExp, CvtLo);
2209 MI.eraseFromParent();
2210 return true;
2211 }
2212
2213 assert(MRI.getType(Dst) == S32);
2214
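  // For a 32-bit result, normalize the 64-bit integer so its significant bits
  // land in the high word (shifting left by the count of leading zero, or
  // redundant sign, bits of the high word), fold a sticky bit from the
  // discarded low word into the value so rounding stays correct, convert that
  // 32-bit word, and rescale with ldexp by (32 - shift).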
2215 auto One = B.buildConstant(S32, 1);
2216
2217 MachineInstrBuilder ShAmt;
2218 if (Signed) {
2219 auto ThirtyOne = B.buildConstant(S32, 31);
2220 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2221 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2222 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2223 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2224 /*HasSideEffects=*/false)
2225 .addUse(Unmerge.getReg(1));
2226 auto LS2 = B.buildSub(S32, LS, One);
2227 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2228 } else
2229 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2230 auto Norm = B.buildShl(S64, Src, ShAmt);
2231 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2232 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2233 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2234 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2235 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2236 B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2237 /*HasSideEffects=*/false)
2238 .addUse(FVal.getReg(0))
2239 .addUse(Scale.getReg(0));
2240 MI.eraseFromParent();
2241 return true;
2242 }
2243
2244 // TODO: Copied from DAG implementation. Verify logic and document how this
2245 // actually works.
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2247 MachineRegisterInfo &MRI,
2248 MachineIRBuilder &B,
2249 bool Signed) const {
2250
2251 Register Dst = MI.getOperand(0).getReg();
2252 Register Src = MI.getOperand(1).getReg();
2253
2254 const LLT S64 = LLT::scalar(64);
2255 const LLT S32 = LLT::scalar(32);
2256
2257 const LLT SrcLT = MRI.getType(Src);
2258 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2259
2260 unsigned Flags = MI.getFlags();
2261
2262 // The basic idea of converting a floating point number into a pair of 32-bit
2263 // integers is illustrated as follows:
2264 //
2265 // tf := trunc(val);
2266 // hif := floor(tf * 2^-32);
2267 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2268 // hi := fptoi(hif);
2269 // lo := fptoi(lof);
2270 //
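  // For example, with val = 2^33 + 5 (exactly representable as an f64):
  //   tf  = 8589934597.0
  //   hif = floor(tf * 2^-32) = 2.0
  //   lof = tf - hif * 2^32   = 5.0
  //   hi  = 2, lo = 5, and merging {lo, hi} reconstructs 0x200000005.
  //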
2271 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2272 MachineInstrBuilder Sign;
2273 if (Signed && SrcLT == S32) {
2274     // However, a 32-bit floating point number has only a 23-bit mantissa,
2275     // which is not enough to hold all the significant bits of `lof` if val is
2276     // negative. To avoid the loss of precision, we take the absolute value
2277     // after truncating and flip the result back based on the original
2278     // signedness.
2279 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2280 Trunc = B.buildFAbs(S32, Trunc, Flags);
2281 }
2282 MachineInstrBuilder K0, K1;
2283 if (SrcLT == S64) {
2284 K0 = B.buildFConstant(S64,
2285 BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2286 K1 = B.buildFConstant(S64,
2287 BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2288 } else {
2289 K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2290 K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2291 }
2292
2293 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2294 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2295 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2296
2297 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2298 : B.buildFPTOUI(S32, FloorMul);
2299 auto Lo = B.buildFPTOUI(S32, Fma);
2300
2301 if (Signed && SrcLT == S32) {
2302 // Flip the result based on the signedness, which is either all 0s or 1s.
2303 Sign = B.buildMerge(S64, {Sign, Sign});
2304 // r := xor({lo, hi}, sign) - sign;
2305 B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2306 } else
2307 B.buildMerge(Dst, {Lo, Hi});
2308 MI.eraseFromParent();
2309
2310 return true;
2311 }
2312
2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2314 MachineInstr &MI) const {
2315 MachineFunction &MF = Helper.MIRBuilder.getMF();
2316 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2317
2318 const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2319 MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2320
2321   // With ieee_mode disabled, the instructions already have the correct
2322   // behavior for G_FMINNUM/G_FMAXNUM.
2323 if (!MFI->getMode().IEEE)
2324 return !IsIEEEOp;
2325
2326 if (IsIEEEOp)
2327 return true;
2328
2329 return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2330 }
2331
2332 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2333 MachineInstr &MI, MachineRegisterInfo &MRI,
2334 MachineIRBuilder &B) const {
2335 // TODO: Should move some of this into LegalizerHelper.
2336
2337 // TODO: Promote dynamic indexing of s16 to s32
2338
2339 // FIXME: Artifact combiner probably should have replaced the truncated
2340 // constant before this, so we shouldn't need
2341 // getIConstantVRegValWithLookThrough.
2342 Optional<ValueAndVReg> MaybeIdxVal =
2343 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2345 return true;
2346 const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2347
2348 Register Dst = MI.getOperand(0).getReg();
2349 Register Vec = MI.getOperand(1).getReg();
2350
2351 LLT VecTy = MRI.getType(Vec);
2352 LLT EltTy = VecTy.getElementType();
2353 assert(EltTy == MRI.getType(Dst));
2354
2355 if (IdxVal < VecTy.getNumElements()) {
2356 auto Unmerge = B.buildUnmerge(EltTy, Vec);
2357 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2358 } else {
2359 B.buildUndef(Dst);
2360 }
2361
2362 MI.eraseFromParent();
2363 return true;
2364 }
2365
2366 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2367 MachineInstr &MI, MachineRegisterInfo &MRI,
2368 MachineIRBuilder &B) const {
2369 // TODO: Should move some of this into LegalizerHelper.
2370
2371 // TODO: Promote dynamic indexing of s16 to s32
2372
2373 // FIXME: Artifact combiner probably should have replaced the truncated
2374 // constant before this, so we shouldn't need
2375 // getIConstantVRegValWithLookThrough.
2376 Optional<ValueAndVReg> MaybeIdxVal =
2377 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2379 return true;
2380
2381 int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2382 Register Dst = MI.getOperand(0).getReg();
2383 Register Vec = MI.getOperand(1).getReg();
2384 Register Ins = MI.getOperand(2).getReg();
2385
2386 LLT VecTy = MRI.getType(Vec);
2387 LLT EltTy = VecTy.getElementType();
2388 assert(EltTy == MRI.getType(Ins));
2389 (void)Ins;
2390
2391 unsigned NumElts = VecTy.getNumElements();
2392 if (IdxVal < NumElts) {
2393 SmallVector<Register, 8> SrcRegs;
2394 for (unsigned i = 0; i < NumElts; ++i)
2395 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2396 B.buildUnmerge(SrcRegs, Vec);
2397
2398 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2399 B.buildMerge(Dst, SrcRegs);
2400 } else {
2401 B.buildUndef(Dst);
2402 }
2403
2404 MI.eraseFromParent();
2405 return true;
2406 }
2407
2408 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2409 MachineInstr &MI, MachineRegisterInfo &MRI,
2410 MachineIRBuilder &B) const {
2411 const LLT V2S16 = LLT::fixed_vector(2, 16);
2412
2413 Register Dst = MI.getOperand(0).getReg();
2414 Register Src0 = MI.getOperand(1).getReg();
2415 LLT DstTy = MRI.getType(Dst);
2416 LLT SrcTy = MRI.getType(Src0);
2417
2418 if (SrcTy == V2S16 && DstTy == V2S16 &&
2419 AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2420 return true;
2421
2422 MachineIRBuilder HelperBuilder(MI);
2423 GISelObserverWrapper DummyObserver;
2424 LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2425 return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2426 }
2427
2428 bool AMDGPULegalizerInfo::legalizeSinCos(
2429 MachineInstr &MI, MachineRegisterInfo &MRI,
2430 MachineIRBuilder &B) const {
2431
2432 Register DstReg = MI.getOperand(0).getReg();
2433 Register SrcReg = MI.getOperand(1).getReg();
2434 LLT Ty = MRI.getType(DstReg);
2435 unsigned Flags = MI.getFlags();
2436
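  // The hardware sin/cos behind these intrinsics take an input measured in
  // turns rather than radians, so scale by 1/(2*pi) first. On subtargets with
  // a reduced trig input range, the scaled value is additionally wrapped into
  // [0, 1) with amdgcn_fract before the call.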
2437 Register TrigVal;
2438 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2439 if (ST.hasTrigReducedRange()) {
2440 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2441 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2442 .addUse(MulVal.getReg(0))
2443 .setMIFlags(Flags).getReg(0);
2444 } else
2445 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2446
2447 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2448 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2449 B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2450 .addUse(TrigVal)
2451 .setMIFlags(Flags);
2452 MI.eraseFromParent();
2453 return true;
2454 }
2455
2456 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2457 MachineIRBuilder &B,
2458 const GlobalValue *GV,
2459 int64_t Offset,
2460 unsigned GAFlags) const {
2461 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2462 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2463 // to the following code sequence:
2464 //
2465 // For constant address space:
2466 // s_getpc_b64 s[0:1]
2467 // s_add_u32 s0, s0, $symbol
2468 // s_addc_u32 s1, s1, 0
2469 //
2470 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2471 // a fixup or relocation is emitted to replace $symbol with a literal
2472 // constant, which is a pc-relative offset from the encoding of the $symbol
2473 // operand to the global variable.
2474 //
2475 // For global address space:
2476 // s_getpc_b64 s[0:1]
2477 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2478 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2479 //
2480 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
2481 // fixups or relocations are emitted to replace $symbol@*@lo and
2482 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2483 // which is a 64-bit pc-relative offset from the encoding of the $symbol
2484 // operand to the global variable.
2485 //
2486 // What we want here is an offset from the value returned by s_getpc
2487 // (which is the address of the s_add_u32 instruction) to the global
2488 // variable, but since the encoding of $symbol starts 4 bytes after the start
2489 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2490 // small. This requires us to add 4 to the global variable offset in order to
2491 // compute the correct address. Similarly for the s_addc_u32 instruction, the
2492 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2493 // instruction.
2494
2495 LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2496
2497 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2498 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2499
2500 MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2501 .addDef(PCReg);
2502
2503 MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2504 if (GAFlags == SIInstrInfo::MO_NONE)
2505 MIB.addImm(0);
2506 else
2507 MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2508
2509 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2510
2511 if (PtrTy.getSizeInBits() == 32)
2512 B.buildExtract(DstReg, PCReg, 0);
2513 return true;
2514 }
2515
2516 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2517 MachineInstr &MI, MachineRegisterInfo &MRI,
2518 MachineIRBuilder &B) const {
2519 Register DstReg = MI.getOperand(0).getReg();
2520 LLT Ty = MRI.getType(DstReg);
2521 unsigned AS = Ty.getAddressSpace();
2522
2523 const GlobalValue *GV = MI.getOperand(1).getGlobal();
2524 MachineFunction &MF = B.getMF();
2525 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2526
2527 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2528 if (!MFI->isModuleEntryFunction() &&
2529 !GV->getName().equals("llvm.amdgcn.module.lds")) {
2530 const Function &Fn = MF.getFunction();
2531 DiagnosticInfoUnsupported BadLDSDecl(
2532 Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2533 DS_Warning);
2534 Fn.getContext().diagnose(BadLDSDecl);
2535
2536 // We currently don't have a way to correctly allocate LDS objects that
2537 // aren't directly associated with a kernel. We do force inlining of
2538 // functions that use local objects. However, if these dead functions are
2539 // not eliminated, we don't want a compile time error. Just emit a warning
2540 // and a trap, since there should be no callable path here.
2541 B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2542 B.buildUndef(DstReg);
2543 MI.eraseFromParent();
2544 return true;
2545 }
2546
2547     // TODO: We could emit code to handle the initialization somewhere.
2548     // We ignore the initializer for now and legalize it to allow selection.
2549     // The initializer will be rejected during assembly emission anyway.
2550 const SITargetLowering *TLI = ST.getTargetLowering();
2551 if (!TLI->shouldUseLDSConstAddress(GV)) {
2552 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2553 return true; // Leave in place;
2554 }
2555
2556 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2557 Type *Ty = GV->getValueType();
2558       // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2559       // zero-sized type in other languages, to declare dynamic shared memory
2560       // whose size is not known at compile time. Such arrays are allocated by
2561       // the runtime and placed directly after the statically allocated ones,
2562       // so they all share the same offset.
2563 if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2564 // Adjust alignment for that dynamic shared memory array.
2565 MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2566 LLT S32 = LLT::scalar(32);
2567 auto Sz =
2568 B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2569 B.buildIntToPtr(DstReg, Sz);
2570 MI.eraseFromParent();
2571 return true;
2572 }
2573 }
2574
2575 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2576 *cast<GlobalVariable>(GV)));
2577 MI.eraseFromParent();
2578 return true;
2579 }
2580
2581 const SITargetLowering *TLI = ST.getTargetLowering();
2582
2583 if (TLI->shouldEmitFixup(GV)) {
2584 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2585 MI.eraseFromParent();
2586 return true;
2587 }
2588
2589 if (TLI->shouldEmitPCReloc(GV)) {
2590 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2591 MI.eraseFromParent();
2592 return true;
2593 }
2594
2595 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2596 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2597
2598 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2599 MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2600 MachinePointerInfo::getGOT(MF),
2601 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2602 MachineMemOperand::MOInvariant,
2603 LoadTy, Align(8));
2604
2605 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2606
2607 if (Ty.getSizeInBits() == 32) {
2608 // Truncate if this is a 32-bit constant address.
2609 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2610 B.buildExtract(DstReg, Load, 0);
2611 } else
2612 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2613
2614 MI.eraseFromParent();
2615 return true;
2616 }
2617
2618 static LLT widenToNextPowerOf2(LLT Ty) {
2619 if (Ty.isVector())
2620 return Ty.changeElementCount(
2621 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2622 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2623 }
2624
2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2626 MachineInstr &MI) const {
2627 MachineIRBuilder &B = Helper.MIRBuilder;
2628 MachineRegisterInfo &MRI = *B.getMRI();
2629 GISelChangeObserver &Observer = Helper.Observer;
2630
2631 Register PtrReg = MI.getOperand(1).getReg();
2632 LLT PtrTy = MRI.getType(PtrReg);
2633 unsigned AddrSpace = PtrTy.getAddressSpace();
2634
2635 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2636 LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2637 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2638 Observer.changingInstr(MI);
2639 MI.getOperand(1).setReg(Cast.getReg(0));
2640 Observer.changedInstr(MI);
2641 return true;
2642 }
2643
2644 if (MI.getOpcode() != AMDGPU::G_LOAD)
2645 return false;
2646
2647 Register ValReg = MI.getOperand(0).getReg();
2648 LLT ValTy = MRI.getType(ValReg);
2649
2650 MachineMemOperand *MMO = *MI.memoperands_begin();
2651 const unsigned ValSize = ValTy.getSizeInBits();
2652 const LLT MemTy = MMO->getMemoryType();
2653 const Align MemAlign = MMO->getAlign();
2654 const unsigned MemSize = MemTy.getSizeInBits();
2655 const uint64_t AlignInBits = 8 * MemAlign.value();
2656
2657 // Widen non-power-of-2 loads to the alignment if needed
2658 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2659 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2660
2661 // This was already the correct extending load result type, so just adjust
2662 // the memory type.
2663 if (WideMemSize == ValSize) {
2664 MachineFunction &MF = B.getMF();
2665
2666 MachineMemOperand *WideMMO =
2667 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2668 Observer.changingInstr(MI);
2669 MI.setMemRefs(MF, {WideMMO});
2670 Observer.changedInstr(MI);
2671 return true;
2672 }
2673
2674     // Don't bother handling an edge case that should probably never be produced.
2675 if (ValSize > WideMemSize)
2676 return false;
2677
2678 LLT WideTy = widenToNextPowerOf2(ValTy);
2679
2680 Register WideLoad;
2681 if (!WideTy.isVector()) {
2682 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2683 B.buildTrunc(ValReg, WideLoad).getReg(0);
2684 } else {
2685 // Extract the subvector.
2686
2687 if (isRegisterType(ValTy)) {
2688       // If this is a case where G_EXTRACT is legal, use it.
2689 // (e.g. <3 x s32> -> <4 x s32>)
2690 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2691 B.buildExtract(ValReg, WideLoad, 0);
2692 } else {
2693 // For cases where the widened type isn't a nice register value, unmerge
2694 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2695 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2696 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2697 }
2698 }
2699
2700 MI.eraseFromParent();
2701 return true;
2702 }
2703
2704 return false;
2705 }
2706
2707 bool AMDGPULegalizerInfo::legalizeFMad(
2708 MachineInstr &MI, MachineRegisterInfo &MRI,
2709 MachineIRBuilder &B) const {
2710 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2711 assert(Ty.isScalar());
2712
2713 MachineFunction &MF = B.getMF();
2714 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2715
2716 // TODO: Always legal with future ftz flag.
2717 // FIXME: Do we need just output?
2718 if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2719 return true;
2720 if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2721 return true;
2722
2723 MachineIRBuilder HelperBuilder(MI);
2724 GISelObserverWrapper DummyObserver;
2725 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2726 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2727 }
2728
2729 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2730 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2731 Register DstReg = MI.getOperand(0).getReg();
2732 Register PtrReg = MI.getOperand(1).getReg();
2733 Register CmpVal = MI.getOperand(2).getReg();
2734 Register NewVal = MI.getOperand(3).getReg();
2735
2736 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2737 "this should not have been custom lowered");
2738
2739 LLT ValTy = MRI.getType(CmpVal);
2740 LLT VecTy = LLT::fixed_vector(2, ValTy);
2741
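  // The target cmpxchg pseudo takes the new value and the compare value packed
  // into a single two-element vector operand, which is presumably how the
  // underlying cmpswap instructions consume their data operand.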
2742 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2743
2744 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2745 .addDef(DstReg)
2746 .addUse(PtrReg)
2747 .addUse(PackedVal)
2748 .setMemRefs(MI.memoperands());
2749
2750 MI.eraseFromParent();
2751 return true;
2752 }
2753
2754 bool AMDGPULegalizerInfo::legalizeFlog(
2755 MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2756 Register Dst = MI.getOperand(0).getReg();
2757 Register Src = MI.getOperand(1).getReg();
2758 LLT Ty = B.getMRI()->getType(Dst);
2759 unsigned Flags = MI.getFlags();
2760
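  // log_b(x) == log2(x) * log_b(2); the callers pass log_b(2) as
  // Log2BaseInverted (ln(2) for G_FLOG, ln(2)/ln(10) for G_FLOG10).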
2761 auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2762 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2763
2764 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2765 MI.eraseFromParent();
2766 return true;
2767 }
2768
2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2770 MachineIRBuilder &B) const {
2771 Register Dst = MI.getOperand(0).getReg();
2772 Register Src = MI.getOperand(1).getReg();
2773 unsigned Flags = MI.getFlags();
2774 LLT Ty = B.getMRI()->getType(Dst);
2775
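  // exp(x) == exp2(x * log2(e)), so scale the input and emit G_FEXP2.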
2776 auto K = B.buildFConstant(Ty, numbers::log2e);
2777 auto Mul = B.buildFMul(Ty, Src, K, Flags);
2778 B.buildFExp2(Dst, Mul, Flags);
2779 MI.eraseFromParent();
2780 return true;
2781 }
2782
2783 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2784 MachineIRBuilder &B) const {
2785 Register Dst = MI.getOperand(0).getReg();
2786 Register Src0 = MI.getOperand(1).getReg();
2787 Register Src1 = MI.getOperand(2).getReg();
2788 unsigned Flags = MI.getFlags();
2789 LLT Ty = B.getMRI()->getType(Dst);
2790 const LLT S16 = LLT::scalar(16);
2791 const LLT S32 = LLT::scalar(32);
2792
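  // pow(x, y) is expanded as exp2(y * log2(x)); the inner multiply uses
  // amdgcn_fmul_legacy, presumably for its 0 * anything == 0 handling of the
  // special cases.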
2793 if (Ty == S32) {
2794 auto Log = B.buildFLog2(S32, Src0, Flags);
2795 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2796 .addUse(Log.getReg(0))
2797 .addUse(Src1)
2798 .setMIFlags(Flags);
2799 B.buildFExp2(Dst, Mul, Flags);
2800 } else if (Ty == S16) {
2801 // There's no f16 fmul_legacy, so we need to convert for it.
2802 auto Log = B.buildFLog2(S16, Src0, Flags);
2803 auto Ext0 = B.buildFPExt(S32, Log, Flags);
2804 auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2805 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2806 .addUse(Ext0.getReg(0))
2807 .addUse(Ext1.getReg(0))
2808 .setMIFlags(Flags);
2809
2810 B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2811 } else
2812 return false;
2813
2814 MI.eraseFromParent();
2815 return true;
2816 }
2817
2818 // Find a source register, ignoring any possible source modifiers.
2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2820 Register ModSrc = OrigSrc;
2821 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2822 ModSrc = SrcFNeg->getOperand(1).getReg();
2823 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2824 ModSrc = SrcFAbs->getOperand(1).getReg();
2825 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2826 ModSrc = SrcFAbs->getOperand(1).getReg();
2827 return ModSrc;
2828 }
2829
2830 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2831 MachineRegisterInfo &MRI,
2832 MachineIRBuilder &B) const {
2833
2834 const LLT S1 = LLT::scalar(1);
2835 const LLT S64 = LLT::scalar(64);
2836 Register Dst = MI.getOperand(0).getReg();
2837 Register OrigSrc = MI.getOperand(1).getReg();
2838 unsigned Flags = MI.getFlags();
2839 assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2840 "this should not have been custom lowered");
2841
2842 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2843 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2844 // efficient way to implement it is using V_FRACT_F64. The workaround for the
2845 // V_FRACT bug is:
2846 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2847 //
2848 // Convert floor(x) to (x - fract(x))
2849
2850 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2851 .addUse(OrigSrc)
2852 .setMIFlags(Flags);
2853
2854 // Give source modifier matching some assistance before obscuring a foldable
2855 // pattern.
2856
2857 // TODO: We can avoid the neg on the fract? The input sign to fract
2858 // shouldn't matter?
2859 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2860
2861 auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
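  // 0x3fefffffffffffff is the largest f64 strictly less than 1.0, matching the
  // 0.99999999999999999 clamp in the workaround described above.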
2862
2863 Register Min = MRI.createGenericVirtualRegister(S64);
2864
2865 // We don't need to concern ourselves with the snan handling difference, so
2866 // use the one which will directly select.
2867 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2868 if (MFI->getMode().IEEE)
2869 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2870 else
2871 B.buildFMinNum(Min, Fract, Const, Flags);
2872
2873 Register CorrectedFract = Min;
2874 if (!MI.getFlag(MachineInstr::FmNoNans)) {
2875 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2876 CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2877 }
2878
2879 auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2880 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2881
2882 MI.eraseFromParent();
2883 return true;
2884 }
2885
2886 // Turn an illegal packed v2s16 build vector into bit operations.
2887 // TODO: This should probably be a bitcast action in LegalizerHelper.
2888 bool AMDGPULegalizerInfo::legalizeBuildVector(
2889 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2890 Register Dst = MI.getOperand(0).getReg();
2891 const LLT S32 = LLT::scalar(32);
2892 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2893
2894 Register Src0 = MI.getOperand(1).getReg();
2895 Register Src1 = MI.getOperand(2).getReg();
2896 assert(MRI.getType(Src0) == LLT::scalar(16));
2897
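  // Pack the two 16-bit sources into one 32-bit scalar and reinterpret it as
  // <2 x s16>; the merge lowering later produces the shift/or bit operations
  // mentioned above.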
2898 auto Merge = B.buildMerge(S32, {Src0, Src1});
2899 B.buildBitcast(Dst, Merge);
2900
2901 MI.eraseFromParent();
2902 return true;
2903 }
2904
2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2906 //
2907 // Source and accumulation registers must all be 32-bits.
2908 //
2909 // TODO: When the multiply is uniform, we should produce a code sequence
2910 // that is better suited to instruction selection on the SALU. Instead of
2911 // the outer loop going over parts of the result, the outer loop should go
2912 // over parts of one of the factors. This should result in instruction
2913 // selection that makes full use of S_ADDC_U32 instructions.
2914 void AMDGPULegalizerInfo::buildMultiply(
2915 LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2916 ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2917 bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2918 // Use (possibly empty) vectors of S1 registers to represent the set of
2919 // carries from one pair of positions to the next.
2920 using Carry = SmallVector<Register, 2>;
2921
2922 MachineIRBuilder &B = Helper.MIRBuilder;
2923
2924 const LLT S1 = LLT::scalar(1);
2925 const LLT S32 = LLT::scalar(32);
2926 const LLT S64 = LLT::scalar(64);
2927
2928 Register Zero32;
2929 Register Zero64;
2930
2931 auto getZero32 = [&]() -> Register {
2932 if (!Zero32)
2933 Zero32 = B.buildConstant(S32, 0).getReg(0);
2934 return Zero32;
2935 };
2936 auto getZero64 = [&]() -> Register {
2937 if (!Zero64)
2938 Zero64 = B.buildConstant(S64, 0).getReg(0);
2939 return Zero64;
2940 };
2941
2942 // Merge the given carries into the 32-bit LocalAccum, which is modified
2943 // in-place.
2944 //
2945 // Returns the carry-out, which is a single S1 register or null.
2946 auto mergeCarry =
2947 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2948 if (CarryIn.empty())
2949 return Register();
2950
2951 bool HaveCarryOut = true;
2952 Register CarryAccum;
2953 if (CarryIn.size() == 1) {
2954 if (!LocalAccum) {
2955 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2956 return Register();
2957 }
2958
2959 CarryAccum = getZero32();
2960 } else {
2961 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2962 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2963 CarryAccum =
2964 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2965 .getReg(0);
2966 }
2967
2968 if (!LocalAccum) {
2969 LocalAccum = getZero32();
2970 HaveCarryOut = false;
2971 }
2972 }
2973
2974 auto Add =
2975 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2976 LocalAccum = Add.getReg(0);
2977 return HaveCarryOut ? Add.getReg(1) : Register();
2978 };
2979
2980 // Build a multiply-add chain to compute
2981 //
2982 // LocalAccum + (partial products at DstIndex)
2983 // + (opportunistic subset of CarryIn)
2984 //
2985 // LocalAccum is an array of one or two 32-bit registers that are updated
2986 // in-place. The incoming registers may be null.
2987 //
2988 // In some edge cases, carry-ins can be consumed "for free". In that case,
2989 // the consumed carry bits are removed from CarryIn in-place.
2990 auto buildMadChain =
2991 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2992 -> Carry {
2993 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2994 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2995
2996 Carry CarryOut;
2997 unsigned j0 = 0;
2998
2999 // Use plain 32-bit multiplication for the most significant part of the
3000 // result by default.
3001 if (LocalAccum.size() == 1 &&
3002 (!UsePartialMad64_32 || !CarryIn.empty())) {
3003 do {
3004 unsigned j1 = DstIndex - j0;
3005 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3006 if (!LocalAccum[0]) {
3007 LocalAccum[0] = Mul.getReg(0);
3008 } else {
3009 if (CarryIn.empty()) {
3010 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3011 } else {
3012 LocalAccum[0] =
3013 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3014 .getReg(0);
3015 CarryIn.pop_back();
3016 }
3017 }
3018 ++j0;
3019 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3020 }
3021
3022 // Build full 64-bit multiplies.
3023 if (j0 <= DstIndex) {
3024 bool HaveSmallAccum = false;
3025 Register Tmp;
3026
3027 if (LocalAccum[0]) {
3028 if (LocalAccum.size() == 1) {
3029 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3030 HaveSmallAccum = true;
3031 } else if (LocalAccum[1]) {
3032 Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3033 HaveSmallAccum = false;
3034 } else {
3035 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3036 HaveSmallAccum = true;
3037 }
3038 } else {
3039 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3040 Tmp = getZero64();
3041 HaveSmallAccum = true;
3042 }
3043
3044 do {
3045 unsigned j1 = DstIndex - j0;
3046 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3047 {Src0[j0], Src1[j1], Tmp});
3048 Tmp = Mad.getReg(0);
3049 if (!HaveSmallAccum)
3050 CarryOut.push_back(Mad.getReg(1));
3051 HaveSmallAccum = false;
3052 ++j0;
3053 } while (j0 <= DstIndex);
3054
3055 auto Unmerge = B.buildUnmerge(S32, Tmp);
3056 LocalAccum[0] = Unmerge.getReg(0);
3057 if (LocalAccum.size() > 1)
3058 LocalAccum[1] = Unmerge.getReg(1);
3059 }
3060
3061 return CarryOut;
3062 };
3063
3064 // Outer multiply loop, iterating over destination parts from least
3065 // significant to most significant parts.
3066 //
3067 // The columns of the following diagram correspond to the destination parts
3068 // affected by one iteration of the outer loop (ignoring boundary
3069 // conditions).
3070 //
3071 // Dest index relative to 2 * i: 1 0 -1
3072 // ------
3073 // Carries from previous iteration: e o
3074 // Even-aligned partial product sum: E E .
3075 // Odd-aligned partial product sum: O O
3076 //
3077 // 'o' is OddCarry, 'e' is EvenCarry.
3078 // EE and OO are computed from partial products via buildMadChain and use
3079 // accumulation where possible and appropriate.
3080 //
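  // As a small illustration, a 64 x 64 -> 64 bit multiply (Accum.size() == 2)
  // runs the loop for i = 0 and i = 1: i = 0 forms the even-aligned product
  // Src0[0]*Src1[0] into Accum[0..1] with a single MAD, and i = 1 adds the
  // odd-aligned partial products Src0[0]*Src1[1] and Src0[1]*Src1[0] into
  // Accum[1], where only their low 32 bits matter.
  //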
3081 Register SeparateOddCarry;
3082 Carry EvenCarry;
3083 Carry OddCarry;
3084
3085 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3086 Carry OddCarryIn = std::move(OddCarry);
3087 Carry EvenCarryIn = std::move(EvenCarry);
3088 OddCarry.clear();
3089 EvenCarry.clear();
3090
3091 // Partial products at offset 2 * i.
3092 if (2 * i < Accum.size()) {
3093 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3094 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3095 }
3096
3097 // Partial products at offset 2 * i - 1.
3098 if (i > 0) {
3099 if (!SeparateOddAlignedProducts) {
3100 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3101 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3102 } else {
3103 bool IsHighest = 2 * i >= Accum.size();
3104 Register SeparateOddOut[2];
3105 auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3106 .take_front(IsHighest ? 1 : 2);
3107 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3108
3109 MachineInstr *Lo;
3110
3111 if (i == 1) {
3112 if (!IsHighest)
3113 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3114 else
3115 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3116 } else {
3117 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3118 SeparateOddCarry);
3119 }
3120 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3121
3122 if (!IsHighest) {
3123 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3124 Lo->getOperand(1).getReg());
3125 Accum[2 * i] = Hi.getReg(0);
3126 SeparateOddCarry = Hi.getReg(1);
3127 }
3128 }
3129 }
3130
3131 // Add in the carries from the previous iteration
3132 if (i > 0) {
3133 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3134 EvenCarryIn.push_back(CarryOut);
3135
3136 if (2 * i < Accum.size()) {
3137 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3138 OddCarry.push_back(CarryOut);
3139 }
3140 }
3141 }
3142 }
3143
3144 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3145 //
3146 // TODO: If the multiply is followed by an addition, we should attempt to
3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
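//
// Rough sketch of the decomposition (assuming a 64-bit multiply split into
// 32-bit parts x = x1:x0, y = y1:y0):
//   x * y = (x0 * y0)                    // feeds result parts 0 and 1
//         + ((x0 * y1 + x1 * y0) << 32)  // feeds result part 1; the high
//                                        // halves fall off the 64-bit result
// Each 32x32->64 partial product can map onto one MAD_U64_U32, with the
// 64-bit accumulator threading the running column sum.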
3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3149 MachineInstr &MI) const {
3150 assert(ST.hasMad64_32());
3151 assert(MI.getOpcode() == TargetOpcode::G_MUL);
3152
3153 MachineIRBuilder &B = Helper.MIRBuilder;
3154 MachineRegisterInfo &MRI = *B.getMRI();
3155
3156 Register DstReg = MI.getOperand(0).getReg();
3157 Register Src0 = MI.getOperand(1).getReg();
3158 Register Src1 = MI.getOperand(2).getReg();
3159
3160 LLT Ty = MRI.getType(DstReg);
3161 assert(Ty.isScalar());
3162
3163 unsigned Size = Ty.getSizeInBits();
3164 unsigned NumParts = Size / 32;
3165 assert((Size % 32) == 0);
3166 assert(NumParts >= 2);
3167
3168 // Whether to use MAD_64_32 for partial products whose high half is
3169 // discarded. This avoids some ADD instructions but risks false dependency
3170 // stalls on some subtargets in some cases.
3171 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3172
3173 // Whether to compute odd-aligned partial products separately. This is
3174 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3175 // in an even-aligned VGPR.
3176 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3177
3178 LLT S32 = LLT::scalar(32);
3179 SmallVector<Register, 2> Src0Parts, Src1Parts;
3180 for (unsigned i = 0; i < NumParts; ++i) {
3181 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3182 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3183 }
3184 B.buildUnmerge(Src0Parts, Src0);
3185 B.buildUnmerge(Src1Parts, Src1);
3186
3187 SmallVector<Register, 2> AccumRegs(NumParts);
3188 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3189 SeparateOddAlignedProducts);
3190
3191 B.buildMerge(DstReg, AccumRegs);
3192 MI.eraseFromParent();
3193 return true;
3194
3195 }
3196
3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3199 // case with a single min instruction instead of a compare+select.
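// For example, on a zero s32 input ffbh/ffbl produce -1 (all ones), so
// umin(result, 32) yields 32, which is exactly what G_CTLZ/G_CTTZ require
// for zero (a sketch; the same pattern scales with the source width).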
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3201 MachineRegisterInfo &MRI,
3202 MachineIRBuilder &B) const {
3203 Register Dst = MI.getOperand(0).getReg();
3204 Register Src = MI.getOperand(1).getReg();
3205 LLT DstTy = MRI.getType(Dst);
3206 LLT SrcTy = MRI.getType(Src);
3207
3208 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3209 ? AMDGPU::G_AMDGPU_FFBH_U32
3210 : AMDGPU::G_AMDGPU_FFBL_B32;
3211 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3212 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3213
3214 MI.eraseFromParent();
3215 return true;
3216 }
3217
3218 // Check that this is a G_XOR x, -1
3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3220 if (MI.getOpcode() != TargetOpcode::G_XOR)
3221 return false;
3222 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3223 return ConstVal && *ConstVal == -1;
3224 }
3225
3226 // Return the use branch instruction, or null if the usage is invalid.
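// Roughly, the expected pattern is:
//   %cond = G_INTRINSIC ...       ; single non-debug use
//   [%ncond = G_XOR %cond, -1]    ; optional negation, also single use
//   G_BRCOND %cond/%ncond, %bb.then
//   G_BR %bb.else                 ; or fall through to the next block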
3227 static MachineInstr *
3228 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3229 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3230 Register CondDef = MI.getOperand(0).getReg();
3231 if (!MRI.hasOneNonDBGUse(CondDef))
3232 return nullptr;
3233
3234 MachineBasicBlock *Parent = MI.getParent();
3235 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3236
3237 if (isNot(MRI, *UseMI)) {
3238 Register NegatedCond = UseMI->getOperand(0).getReg();
3239 if (!MRI.hasOneNonDBGUse(NegatedCond))
3240 return nullptr;
3241
3242 // We're deleting the def of this value, so we need to remove it.
3243 eraseInstr(*UseMI, MRI);
3244
3245 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3246 Negated = true;
3247 }
3248
3249 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3250 return nullptr;
3251
3252 // Make sure the cond br is followed by a G_BR, or is the last instruction.
3253 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3254 if (Next == Parent->end()) {
3255 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3256 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3257 return nullptr;
3258 UncondBrTarget = &*NextMBB;
3259 } else {
3260 if (Next->getOpcode() != AMDGPU::G_BR)
3261 return nullptr;
3262 Br = &*Next;
3263 UncondBrTarget = Br->getOperand(0).getMBB();
3264 }
3265
3266 return UseMI;
3267 }
3268
3269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3270 const ArgDescriptor *Arg,
3271 const TargetRegisterClass *ArgRC,
3272 LLT ArgTy) const {
3273 MCRegister SrcReg = Arg->getRegister();
3274 assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3275 assert(DstReg.isVirtual() && "Virtual register expected");
3276
3277 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3278 *ArgRC, B.getDebugLoc(), ArgTy);
3279 if (Arg->isMasked()) {
3280 // TODO: Should we try to emit this once in the entry block?
3281 const LLT S32 = LLT::scalar(32);
3282 const unsigned Mask = Arg->getMask();
3283 const unsigned Shift = countTrailingZeros<unsigned>(Mask);
3284
3285 Register AndMaskSrc = LiveIn;
3286
3287 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3288 // 0.
3289 if (Shift != 0) {
3290 auto ShiftAmt = B.buildConstant(S32, Shift);
3291 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3292 }
3293
3294 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3295 } else {
3296 B.buildCopy(DstReg, LiveIn);
3297 }
3298
3299 return true;
3300 }
3301
3302 bool AMDGPULegalizerInfo::loadInputValue(
3303 Register DstReg, MachineIRBuilder &B,
3304 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3305 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3306 const ArgDescriptor *Arg;
3307 const TargetRegisterClass *ArgRC;
3308 LLT ArgTy;
3309 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3310
3311 if (!Arg) {
3312 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3313 // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3314 // case the pointer argument may be missing and we use null.
3315 B.buildConstant(DstReg, 0);
3316 return true;
3317 }
3318
3319 // It's undefined behavior if a function marked with the amdgpu-no-*
3320 // attributes uses the corresponding intrinsic.
3321 B.buildUndef(DstReg);
3322 return true;
3323 }
3324
3325 if (!Arg->isRegister() || !Arg->getRegister().isValid())
3326 return false; // TODO: Handle these
3327 return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3328 }
3329
3330 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3331 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3332 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3333 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3334 return false;
3335
3336 MI.eraseFromParent();
3337 return true;
3338 }
3339
3340 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3341 int64_t C) {
3342 B.buildConstant(MI.getOperand(0).getReg(), C);
3343 MI.eraseFromParent();
3344 return true;
3345 }
3346
3347 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3348 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3349 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3350 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3351 if (MaxID == 0)
3352 return replaceWithConstant(B, MI, 0);
3353
3354 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3355 const ArgDescriptor *Arg;
3356 const TargetRegisterClass *ArgRC;
3357 LLT ArgTy;
3358 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3359
3360 Register DstReg = MI.getOperand(0).getReg();
3361 if (!Arg) {
3362 // It's undefined behavior if a function marked with the amdgpu-no-*
3363 // attributes uses the corresponding intrinsic.
3364 B.buildUndef(DstReg);
3365 MI.eraseFromParent();
3366 return true;
3367 }
3368
3369 if (Arg->isMasked()) {
3370 // Don't bother inserting AssertZext for packed IDs since we're emitting the
3371 // masking operations anyway.
3372 //
3373 // TODO: We could assert the top bit is 0 for the source copy.
3374 if (!loadInputValue(DstReg, B, ArgType))
3375 return false;
3376 } else {
3377 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3378 if (!loadInputValue(TmpReg, B, ArgType))
3379 return false;
3380 B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3381 }
3382
3383 MI.eraseFromParent();
3384 return true;
3385 }
3386
3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3388 int64_t Offset) const {
3389 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3390 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3391
3392 // TODO: If we passed in the base kernel offset we could have a better
3393 // alignment than 4, but we don't really need it.
3394 if (!loadInputValue(KernArgReg, B,
3395 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3396 llvm_unreachable("failed to find kernarg segment ptr");
3397
3398 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3399 // TODO: Should get nuw
3400 return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3401 }
3402
3403 /// Legalize a value that's loaded from kernel arguments. This is only used by
3404 /// legacy intrinsics.
3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3406 MachineIRBuilder &B,
3407 uint64_t Offset,
3408 Align Alignment) const {
3409 Register DstReg = MI.getOperand(0).getReg();
3410
3411 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3412 "unexpected kernarg parameter type");
3413
3414 Register Ptr = getKernargParameterPtr(B, Offset);
3415 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3416 B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3417 MachineMemOperand::MODereferenceable |
3418 MachineMemOperand::MOInvariant);
3419 MI.eraseFromParent();
3420 return true;
3421 }
3422
3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3424 MachineRegisterInfo &MRI,
3425 MachineIRBuilder &B) const {
3426 Register Dst = MI.getOperand(0).getReg();
3427 LLT DstTy = MRI.getType(Dst);
3428 LLT S16 = LLT::scalar(16);
3429 LLT S32 = LLT::scalar(32);
3430 LLT S64 = LLT::scalar(64);
3431
3432 if (DstTy == S16)
3433 return legalizeFDIV16(MI, MRI, B);
3434 if (DstTy == S32)
3435 return legalizeFDIV32(MI, MRI, B);
3436 if (DstTy == S64)
3437 return legalizeFDIV64(MI, MRI, B);
3438
3439 return false;
3440 }
3441
3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3443 Register DstDivReg,
3444 Register DstRemReg,
3445 Register X,
3446 Register Y) const {
3447 const LLT S1 = LLT::scalar(1);
3448 const LLT S32 = LLT::scalar(32);
3449
3450 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3451 // algorithm used here.
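// Roughly: compute z ~= 2^32 / y from the f32 reciprocal, sharpen it with one
// Newton-Raphson step carried out in integer arithmetic, then take
// q = umulh(x, z) and r = x - q * y. The estimate can fall short by a small
// amount, so the two conditional (q + 1, r - y) refinements below converge on
// the exact quotient and remainder.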
3452
3453 // Initial estimate of inv(y).
3454 auto FloatY = B.buildUITOFP(S32, Y);
3455 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3456 auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3457 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3458 auto Z = B.buildFPTOUI(S32, ScaledY);
3459
3460 // One round of UNR (unsigned Newton-Raphson) refinement of the estimate.
3461 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3462 auto NegYZ = B.buildMul(S32, NegY, Z);
3463 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3464
3465 // Quotient/remainder estimate.
3466 auto Q = B.buildUMulH(S32, X, Z);
3467 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3468
3469 // First quotient/remainder refinement.
3470 auto One = B.buildConstant(S32, 1);
3471 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3472 if (DstDivReg)
3473 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3474 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3475
3476 // Second quotient/remainder refinement.
3477 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3478 if (DstDivReg)
3479 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3480
3481 if (DstRemReg)
3482 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3483 }
3484
3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3486 //
3487 // Return lo, hi of result
3488 //
3489 // %cvt.lo = G_UITOFP Val.lo
3490 // %cvt.hi = G_UITOFP Val.hi
3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3494 // %mul2 = G_FMUL %mul1, 2**(-32)
3495 // %trunc = G_INTRINSIC_TRUNC %mul2
3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
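//
// The f32 bit patterns above are, approximately: 0x5f7ffffc ~= 2**64 (just
// below it), 0x2f800000 == 2**(-32) and 0xcf800000 == -(2**32), so the
// returned {lo, hi} pair forms a roughly 64-bit fixed-point estimate of
// 2**64 / Val.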
3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3499 Register Val) {
3500 const LLT S32 = LLT::scalar(32);
3501 auto Unmerge = B.buildUnmerge(S32, Val);
3502
3503 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3504 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3505
3506 auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3507 B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3508
3509 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3510 auto Mul1 =
3511 B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3512
3513 // 2**(-32)
3514 auto Mul2 =
3515 B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3516 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3517
3518 // -(2**32)
3519 auto Mad2 = B.buildFMAD(S32, Trunc,
3520 B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3521
3522 auto ResultLo = B.buildFPTOUI(S32, Mad2);
3523 auto ResultHi = B.buildFPTOUI(S32, Trunc);
3524
3525 return {ResultLo.getReg(0), ResultHi.getReg(0)};
3526 }
3527
3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3529 Register DstDivReg,
3530 Register DstRemReg,
3531 Register Numer,
3532 Register Denom) const {
3533 const LLT S32 = LLT::scalar(32);
3534 const LLT S64 = LLT::scalar(64);
3535 const LLT S1 = LLT::scalar(1);
3536 Register RcpLo, RcpHi;
3537
3538 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3539
3540 auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3541
3542 auto Zero64 = B.buildConstant(S64, 0);
3543 auto NegDenom = B.buildSub(S64, Zero64, Denom);
3544
3545 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3546 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3547
3548 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3549 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3550 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3551
3552 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3553 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3554 auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3555
3556 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3557 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3558 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3559 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3560 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3561
3562 auto Zero32 = B.buildConstant(S32, 0);
3563 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3564 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3565 auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3566
3567 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3568 Register NumerLo = UnmergeNumer.getReg(0);
3569 Register NumerHi = UnmergeNumer.getReg(1);
3570
3571 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3572 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3573 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3574 Register Mul3_Lo = UnmergeMul3.getReg(0);
3575 Register Mul3_Hi = UnmergeMul3.getReg(1);
3576 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3577 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3578 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3579 auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3580
3581 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3582 Register DenomLo = UnmergeDenom.getReg(0);
3583 Register DenomHi = UnmergeDenom.getReg(1);
3584
3585 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3586 auto C1 = B.buildSExt(S32, CmpHi);
3587
3588 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3589 auto C2 = B.buildSExt(S32, CmpLo);
3590
3591 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3592 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3593
3594 // TODO: The code here and below could be enclosed in if/endif control flow.
3595 // Currently the control flow is unconditional and the 4 selects after the
3596 // potential endif substitute for PHIs.
3597
3598 // if C3 != 0 ...
3599 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3600 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3601 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3602 auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3603
3604 auto One64 = B.buildConstant(S64, 1);
3605 auto Add3 = B.buildAdd(S64, MulHi3, One64);
3606
3607 auto C4 =
3608 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3609 auto C5 =
3610 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3611 auto C6 = B.buildSelect(
3612 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3613
3614 // if (C6 != 0)
3615 auto Add4 = B.buildAdd(S64, Add3, One64);
3616 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3617
3618 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3619 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3620 auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3621
3622 // endif C6
3623 // endif C3
3624
3625 if (DstDivReg) {
3626 auto Sel1 = B.buildSelect(
3627 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3628 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3629 Sel1, MulHi3);
3630 }
3631
3632 if (DstRemReg) {
3633 auto Sel2 = B.buildSelect(
3634 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3635 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3636 Sel2, Sub1);
3637 }
3638 }
3639
3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3641 MachineRegisterInfo &MRI,
3642 MachineIRBuilder &B) const {
3643 Register DstDivReg, DstRemReg;
3644 switch (MI.getOpcode()) {
3645 default:
3646 llvm_unreachable("Unexpected opcode!");
3647 case AMDGPU::G_UDIV: {
3648 DstDivReg = MI.getOperand(0).getReg();
3649 break;
3650 }
3651 case AMDGPU::G_UREM: {
3652 DstRemReg = MI.getOperand(0).getReg();
3653 break;
3654 }
3655 case AMDGPU::G_UDIVREM: {
3656 DstDivReg = MI.getOperand(0).getReg();
3657 DstRemReg = MI.getOperand(1).getReg();
3658 break;
3659 }
3660 }
3661
3662 const LLT S64 = LLT::scalar(64);
3663 const LLT S32 = LLT::scalar(32);
3664 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3665 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3666 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3667 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3668
3669 if (Ty == S32)
3670 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3671 else if (Ty == S64)
3672 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3673 else
3674 return false;
3675
3676 MI.eraseFromParent();
3677 return true;
3678 }
3679
3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3681 MachineRegisterInfo &MRI,
3682 MachineIRBuilder &B) const {
3683 const LLT S64 = LLT::scalar(64);
3684 const LLT S32 = LLT::scalar(32);
3685
3686 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3687 if (Ty != S32 && Ty != S64)
3688 return false;
3689
3690 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3691 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3692 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3693
3694 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3695 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3696 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3697
3698 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3699 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3700
3701 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3702 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3703
3704 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3705 switch (MI.getOpcode()) {
3706 default:
3707 llvm_unreachable("Unexpected opcode!");
3708 case AMDGPU::G_SDIV: {
3709 DstDivReg = MI.getOperand(0).getReg();
3710 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711 break;
3712 }
3713 case AMDGPU::G_SREM: {
3714 DstRemReg = MI.getOperand(0).getReg();
3715 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3716 break;
3717 }
3718 case AMDGPU::G_SDIVREM: {
3719 DstDivReg = MI.getOperand(0).getReg();
3720 DstRemReg = MI.getOperand(1).getReg();
3721 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3722 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723 break;
3724 }
3725 }
3726
3727 if (Ty == S32)
3728 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3729 else
3730 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3731
3732 if (DstDivReg) {
3733 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3734 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3735 B.buildSub(DstDivReg, SignXor, Sign);
3736 }
3737
3738 if (DstRemReg) {
3739 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3740 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3741 B.buildSub(DstRemReg, SignXor, Sign);
3742 }
3743
3744 MI.eraseFromParent();
3745 return true;
3746 }
3747
3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3749 MachineRegisterInfo &MRI,
3750 MachineIRBuilder &B) const {
3751 Register Res = MI.getOperand(0).getReg();
3752 Register LHS = MI.getOperand(1).getReg();
3753 Register RHS = MI.getOperand(2).getReg();
3754 uint16_t Flags = MI.getFlags();
3755 LLT ResTy = MRI.getType(Res);
3756
3757 const MachineFunction &MF = B.getMF();
3758 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3759 MI.getFlag(MachineInstr::FmAfn);
3760
3761 if (!AllowInaccurateRcp)
3762 return false;
3763
3764 if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3765 // 1 / x -> RCP(x)
3766 if (CLHS->isExactlyValue(1.0)) {
3767 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3768 .addUse(RHS)
3769 .setMIFlags(Flags);
3770
3771 MI.eraseFromParent();
3772 return true;
3773 }
3774
3775 // -1 / x -> RCP( FNEG(x) )
3776 if (CLHS->isExactlyValue(-1.0)) {
3777 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3778 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3779 .addUse(FNeg.getReg(0))
3780 .setMIFlags(Flags);
3781
3782 MI.eraseFromParent();
3783 return true;
3784 }
3785 }
3786
3787 // x / y -> x * (1.0 / y)
3788 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3789 .addUse(RHS)
3790 .setMIFlags(Flags);
3791 B.buildFMul(Res, LHS, RCP, Flags);
3792
3793 MI.eraseFromParent();
3794 return true;
3795 }
3796
3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3798 MachineRegisterInfo &MRI,
3799 MachineIRBuilder &B) const {
3800 Register Res = MI.getOperand(0).getReg();
3801 Register X = MI.getOperand(1).getReg();
3802 Register Y = MI.getOperand(2).getReg();
3803 uint16_t Flags = MI.getFlags();
3804 LLT ResTy = MRI.getType(Res);
3805
3806 const MachineFunction &MF = B.getMF();
3807 bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3808 MI.getFlag(MachineInstr::FmAfn);
3809
3810 if (!AllowInaccurateRcp)
3811 return false;
3812
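// A sketch of what follows: start from r = rcp(y), apply two Newton-Raphson
// steps r <- r + r * (1 - y * r) via FMA, form ret = x * r, and add one
// final residual correction ret + (x - y * ret) * r.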
3813 auto NegY = B.buildFNeg(ResTy, Y);
3814 auto One = B.buildFConstant(ResTy, 1.0);
3815
3816 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3817 .addUse(Y)
3818 .setMIFlags(Flags);
3819
3820 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3821 R = B.buildFMA(ResTy, Tmp0, R, R);
3822
3823 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3824 R = B.buildFMA(ResTy, Tmp1, R, R);
3825
3826 auto Ret = B.buildFMul(ResTy, X, R);
3827 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3828
3829 B.buildFMA(Res, Tmp2, R, Ret);
3830 MI.eraseFromParent();
3831 return true;
3832 }
3833
3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3835 MachineRegisterInfo &MRI,
3836 MachineIRBuilder &B) const {
3837 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3838 return true;
3839
3840 Register Res = MI.getOperand(0).getReg();
3841 Register LHS = MI.getOperand(1).getReg();
3842 Register RHS = MI.getOperand(2).getReg();
3843
3844 uint16_t Flags = MI.getFlags();
3845
3846 LLT S16 = LLT::scalar(16);
3847 LLT S32 = LLT::scalar(32);
3848
3849 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3850 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3851
3852 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3853 .addUse(RHSExt.getReg(0))
3854 .setMIFlags(Flags);
3855
3856 auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3857 auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3858
3859 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3860 .addUse(RDst.getReg(0))
3861 .addUse(RHS)
3862 .addUse(LHS)
3863 .setMIFlags(Flags);
3864
3865 MI.eraseFromParent();
3866 return true;
3867 }
3868
3869 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3870 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
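// On subtargets with S_DENORM_MODE this is a single immediate update;
// otherwise it is done with S_SETREG on the FP32 denorm bits of the MODE
// register (the 2-bit field at offset 4 selected below).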
3871 static void toggleSPDenormMode(bool Enable,
3872 MachineIRBuilder &B,
3873 const GCNSubtarget &ST,
3874 AMDGPU::SIModeRegisterDefaults Mode) {
3875 // Set SP denorm mode to this value.
3876 unsigned SPDenormMode =
3877 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3878
3879 if (ST.hasDenormModeInst()) {
3880 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3881 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3882
3883 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3884 B.buildInstr(AMDGPU::S_DENORM_MODE)
3885 .addImm(NewDenormModeValue);
3886
3887 } else {
3888 // Select FP32 bit field in mode register.
3889 unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3890 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3891 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3892
3893 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3894 .addImm(SPDenormMode)
3895 .addImm(SPDenormModeBitField);
3896 }
3897 }
3898
3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3900 MachineRegisterInfo &MRI,
3901 MachineIRBuilder &B) const {
3902 if (legalizeFastUnsafeFDIV(MI, MRI, B))
3903 return true;
3904
3905 Register Res = MI.getOperand(0).getReg();
3906 Register LHS = MI.getOperand(1).getReg();
3907 Register RHS = MI.getOperand(2).getReg();
3908 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3909 AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3910
3911 uint16_t Flags = MI.getFlags();
3912
3913 LLT S32 = LLT::scalar(32);
3914 LLT S1 = LLT::scalar(1);
3915
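// Overall shape of the expansion (a sketch): div_scale pre-scales the
// operands, an FMA-based Newton-Raphson chain refines the rcp of the scaled
// denominator, div_fmas folds the scale state back in, and div_fixup patches
// up the special cases. The FP32 denorm mode is force-enabled around the FMA
// chain when the function's default mode flushes denormals.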
3916 auto One = B.buildFConstant(S32, 1.0f);
3917
3918 auto DenominatorScaled =
3919 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3920 .addUse(LHS)
3921 .addUse(RHS)
3922 .addImm(0)
3923 .setMIFlags(Flags);
3924 auto NumeratorScaled =
3925 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3926 .addUse(LHS)
3927 .addUse(RHS)
3928 .addImm(1)
3929 .setMIFlags(Flags);
3930
3931 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3932 .addUse(DenominatorScaled.getReg(0))
3933 .setMIFlags(Flags);
3934 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3935
3936 // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3937 // aren't modeled as reading it.
3938 if (!Mode.allFP32Denormals())
3939 toggleSPDenormMode(true, B, ST, Mode);
3940
3941 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3942 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3943 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3944 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3945 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3946 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3947
3948 if (!Mode.allFP32Denormals())
3949 toggleSPDenormMode(false, B, ST, Mode);
3950
3951 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3952 .addUse(Fma4.getReg(0))
3953 .addUse(Fma1.getReg(0))
3954 .addUse(Fma3.getReg(0))
3955 .addUse(NumeratorScaled.getReg(1))
3956 .setMIFlags(Flags);
3957
3958 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3959 .addUse(Fmas.getReg(0))
3960 .addUse(RHS)
3961 .addUse(LHS)
3962 .setMIFlags(Flags);
3963
3964 MI.eraseFromParent();
3965 return true;
3966 }
3967
3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3969 MachineRegisterInfo &MRI,
3970 MachineIRBuilder &B) const {
3971 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3972 return true;
3973
3974 Register Res = MI.getOperand(0).getReg();
3975 Register LHS = MI.getOperand(1).getReg();
3976 Register RHS = MI.getOperand(2).getReg();
3977
3978 uint16_t Flags = MI.getFlags();
3979
3980 LLT S64 = LLT::scalar(64);
3981 LLT S1 = LLT::scalar(1);
3982
3983 auto One = B.buildFConstant(S64, 1.0);
3984
3985 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3986 .addUse(LHS)
3987 .addUse(RHS)
3988 .addImm(0)
3989 .setMIFlags(Flags);
3990
3991 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3992
3993 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3994 .addUse(DivScale0.getReg(0))
3995 .setMIFlags(Flags);
3996
3997 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3998 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3999 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4000
4001 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4002 .addUse(LHS)
4003 .addUse(RHS)
4004 .addImm(1)
4005 .setMIFlags(Flags);
4006
4007 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4008 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4009 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4010
4011 Register Scale;
4012 if (!ST.hasUsableDivScaleConditionOutput()) {
4013 // Workaround a hardware bug on SI where the condition output from div_scale
4014 // is not usable.
4015
4016 LLT S32 = LLT::scalar(32);
4017
4018 auto NumUnmerge = B.buildUnmerge(S32, LHS);
4019 auto DenUnmerge = B.buildUnmerge(S32, RHS);
4020 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4021 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4022
4023 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4024 Scale1Unmerge.getReg(1));
4025 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4026 Scale0Unmerge.getReg(1));
4027 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4028 } else {
4029 Scale = DivScale1.getReg(1);
4030 }
4031
4032 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4033 .addUse(Fma4.getReg(0))
4034 .addUse(Fma3.getReg(0))
4035 .addUse(Mul.getReg(0))
4036 .addUse(Scale)
4037 .setMIFlags(Flags);
4038
4039 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4040 .addUse(Fmas.getReg(0))
4041 .addUse(RHS)
4042 .addUse(LHS)
4043 .setMIFlags(Flags);
4044
4045 MI.eraseFromParent();
4046 return true;
4047 }
4048
4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4050 MachineRegisterInfo &MRI,
4051 MachineIRBuilder &B) const {
4052 Register Res = MI.getOperand(0).getReg();
4053 Register LHS = MI.getOperand(2).getReg();
4054 Register RHS = MI.getOperand(3).getReg();
4055 uint16_t Flags = MI.getFlags();
4056
4057 LLT S32 = LLT::scalar(32);
4058 LLT S1 = LLT::scalar(1);
4059
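// Sketch of the scaling trick below: if |rhs| is very large (> 2**96, the
// 0x6f800000 pattern), pre-multiply rhs by 2**-32 (0x2f800000) so the rcp
// input stays in range, then multiply the quotient by the same factor
// afterwards; otherwise the scale factor is simply 1.0.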
4060 auto Abs = B.buildFAbs(S32, RHS, Flags);
4061 const APFloat C0Val(1.0f);
4062
4063 auto C0 = B.buildConstant(S32, 0x6f800000);
4064 auto C1 = B.buildConstant(S32, 0x2f800000);
4065 auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4066
4067 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4068 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4069
4070 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4071
4072 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4073 .addUse(Mul0.getReg(0))
4074 .setMIFlags(Flags);
4075
4076 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4077
4078 B.buildFMul(Res, Sel, Mul1, Flags);
4079
4080 MI.eraseFromParent();
4081 return true;
4082 }
4083
4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4085 // FIXME: Why do we handle this one but not other removed instructions?
4086 //
4087 // Reciprocal square root. The clamp prevents infinite results, clamping
4088 // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
4089 // +-max_float.
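// For example, rsq_clamp(+0.0) produces +max_float rather than +inf, and
// rsq_clamp(-0.0) produces -max_float rather than -inf (illustrative only).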
4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4091 MachineRegisterInfo &MRI,
4092 MachineIRBuilder &B) const {
4093 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4094 return true;
4095
4096 Register Dst = MI.getOperand(0).getReg();
4097 Register Src = MI.getOperand(2).getReg();
4098 auto Flags = MI.getFlags();
4099
4100 LLT Ty = MRI.getType(Dst);
4101
4102 const fltSemantics *FltSemantics;
4103 if (Ty == LLT::scalar(32))
4104 FltSemantics = &APFloat::IEEEsingle();
4105 else if (Ty == LLT::scalar(64))
4106 FltSemantics = &APFloat::IEEEdouble();
4107 else
4108 return false;
4109
4110 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4111 .addUse(Src)
4112 .setMIFlags(Flags);
4113
4114 // We don't need to concern ourselves with the snan handling difference, since
4115 // the rsq output is already quieted (or not); use the form that selects directly.
4116 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4117 const bool UseIEEE = MFI->getMode().IEEE;
4118
4119 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4120 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4122
4123 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4124
4125 if (UseIEEE)
4126 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4127 else
4128 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129 MI.eraseFromParent();
4130 return true;
4131 }
4132
4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4134 switch (IID) {
4135 case Intrinsic::amdgcn_ds_fadd:
4136 return AMDGPU::G_ATOMICRMW_FADD;
4137 case Intrinsic::amdgcn_ds_fmin:
4138 return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4139 case Intrinsic::amdgcn_ds_fmax:
4140 return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4141 default:
4142 llvm_unreachable("not a DS FP intrinsic");
4143 }
4144 }
4145
4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4147 MachineInstr &MI,
4148 Intrinsic::ID IID) const {
4149 GISelChangeObserver &Observer = Helper.Observer;
4150 Observer.changingInstr(MI);
4151
4152 MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4153
4154 // The remaining operands were used to set fields in the MemOperand on
4155 // construction.
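// E.g. for llvm.amdgcn.ds.fadd the G_INTRINSIC form is roughly
//   (dst, intrin-id, ptr, value, ordering, scope, isVolatile);
// dropping operands 6..4 and then the intrinsic ID leaves (dst, ptr, value),
// which is what the target atomic opcode expects.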
4156 for (int I = 6; I > 3; --I)
4157 MI.removeOperand(I);
4158
4159 MI.removeOperand(1); // Remove the intrinsic ID.
4160 Observer.changedInstr(MI);
4161 return true;
4162 }
4163
4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4165 MachineRegisterInfo &MRI,
4166 MachineIRBuilder &B) const {
4167 uint64_t Offset =
4168 ST.getTargetLowering()->getImplicitParameterOffset(
4169 B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4170 LLT DstTy = MRI.getType(DstReg);
4171 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4172
4173 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4174 if (!loadInputValue(KernargPtrReg, B,
4175 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4176 return false;
4177
4178 // FIXME: This should be nuw
4179 B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4180 return true;
4181 }
4182
4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4184 MachineRegisterInfo &MRI,
4185 MachineIRBuilder &B) const {
4186 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4187 if (!MFI->isEntryFunction()) {
4188 return legalizePreloadedArgIntrin(MI, MRI, B,
4189 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4190 }
4191
4192 Register DstReg = MI.getOperand(0).getReg();
4193 if (!getImplicitArgPtr(DstReg, MRI, B))
4194 return false;
4195
4196 MI.eraseFromParent();
4197 return true;
4198 }
4199
4200 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4201 MachineRegisterInfo &MRI,
4202 MachineIRBuilder &B) const {
4203 Function &F = B.getMF().getFunction();
4204 Optional<uint32_t> KnownSize =
4205 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4206 if (KnownSize.has_value())
4207 B.buildConstant(DstReg, KnownSize.value());
4208 return false;
4209 }
4210
4211 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4212 MachineRegisterInfo &MRI,
4213 MachineIRBuilder &B) const {
4214
4215 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4216 if (!MFI->isEntryFunction()) {
4217 return legalizePreloadedArgIntrin(MI, MRI, B,
4218 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4219 }
4220
4221 Register DstReg = MI.getOperand(0).getReg();
4222 if (!getLDSKernelId(DstReg, MRI, B))
4223 return false;
4224
4225 MI.eraseFromParent();
4226 return true;
4227 }
4228
4229 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4230 MachineRegisterInfo &MRI,
4231 MachineIRBuilder &B,
4232 unsigned AddrSpace) const {
4233 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4234 auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4235 Register Hi32 = Unmerge.getReg(1);
4236
4237 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4238 MI.eraseFromParent();
4239 return true;
4240 }
4241
4242 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4243 // offset (the offset that is included in bounds checking and swizzling, to be
4244 // split between the instruction's voffset and immoffset fields) and soffset
4245 // (the offset that is excluded from bounds checking and swizzling, to go in
4246 // the instruction's soffset field). This function takes the first kind of
4247 // offset and figures out how to split it between voffset and immoffset.
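// For example (a sketch), a combined offset of base + 5000 becomes
// voffset = base + 4096 and immoffset = 904, keeping the voffset add a
// multiple of 4096 so it stands a chance of being CSEd across neighbouring
// accesses.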
4248 std::pair<Register, unsigned>
4249 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4250 Register OrigOffset) const {
4251 const unsigned MaxImm = 4095;
4252 Register BaseReg;
4253 unsigned ImmOffset;
4254 const LLT S32 = LLT::scalar(32);
4255 MachineRegisterInfo &MRI = *B.getMRI();
4256
4257 std::tie(BaseReg, ImmOffset) =
4258 AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4259
4260 // If BaseReg is a pointer, convert it to int.
4261 if (MRI.getType(BaseReg).isPointer())
4262 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4263
4264 // If the immediate value is too big for the immoffset field, keep only its
4265 // low 12 bits in the immoffset field so that the value that is copied/added
4266 // for the voffset field is a multiple of 4096, and it stands more chance
4267 // of being CSEd with the copy/add for another similar load/store.
4268 // However, do not round down to a multiple of 4096 if the overflow part would
4269 // be negative, as it appears to be illegal to have a negative offset
4270 // in the vgpr, even if adding the immediate offset makes it positive.
4271 unsigned Overflow = ImmOffset & ~MaxImm;
4272 ImmOffset -= Overflow;
4273 if ((int32_t)Overflow < 0) {
4274 Overflow += ImmOffset;
4275 ImmOffset = 0;
4276 }
4277
4278 if (Overflow != 0) {
4279 if (!BaseReg) {
4280 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4281 } else {
4282 auto OverflowVal = B.buildConstant(S32, Overflow);
4283 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4284 }
4285 }
4286
4287 if (!BaseReg)
4288 BaseReg = B.buildConstant(S32, 0).getReg(0);
4289
4290 return std::make_pair(BaseReg, ImmOffset);
4291 }
4292
4293 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4294 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4295 Register VOffset, Register SOffset,
4296 unsigned ImmOffset, Register VIndex,
4297 MachineRegisterInfo &MRI) const {
4298 Optional<ValueAndVReg> MaybeVOffsetVal =
4299 getIConstantVRegValWithLookThrough(VOffset, MRI);
4300 Optional<ValueAndVReg> MaybeSOffsetVal =
4301 getIConstantVRegValWithLookThrough(SOffset, MRI);
4302 Optional<ValueAndVReg> MaybeVIndexVal =
4303 getIConstantVRegValWithLookThrough(VIndex, MRI);
4304 // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4305 // update the MMO with that offset. The stride is unknown so we can only do
4306 // this if VIndex is constant 0.
4307 if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4308 MaybeVIndexVal->Value == 0) {
4309 uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4310 MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4311 MMO->setOffset(TotalOffset);
4312 } else {
4313 // We don't have a constant combined offset to use in the MMO. Give up.
4314 MMO->setValue((Value *)nullptr);
4315 }
4316 }
4317
4318 /// Handle register layout difference for f16 images for some subtargets.
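// For instance (a sketch): on unpacked-d16 subtargets a <4 x s16> payload is
// widened to <4 x s32> with each element any-extended into its own dword,
// while on subtargets with the image-store d16 bug a <3 x s16> value is
// padded out and bitcast to <3 x s32>.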
4319 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4320 MachineRegisterInfo &MRI,
4321 Register Reg,
4322 bool ImageStore) const {
4323 const LLT S16 = LLT::scalar(16);
4324 const LLT S32 = LLT::scalar(32);
4325 LLT StoreVT = MRI.getType(Reg);
4326 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4327
4328 if (ST.hasUnpackedD16VMem()) {
4329 auto Unmerge = B.buildUnmerge(S16, Reg);
4330
4331 SmallVector<Register, 4> WideRegs;
4332 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4333 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4334
4335 int NumElts = StoreVT.getNumElements();
4336
4337 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4338 .getReg(0);
4339 }
4340
4341 if (ImageStore && ST.hasImageStoreD16Bug()) {
4342 if (StoreVT.getNumElements() == 2) {
4343 SmallVector<Register, 4> PackedRegs;
4344 Reg = B.buildBitcast(S32, Reg).getReg(0);
4345 PackedRegs.push_back(Reg);
4346 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4347 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4348 .getReg(0);
4349 }
4350
4351 if (StoreVT.getNumElements() == 3) {
4352 SmallVector<Register, 4> PackedRegs;
4353 auto Unmerge = B.buildUnmerge(S16, Reg);
4354 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4355 PackedRegs.push_back(Unmerge.getReg(I));
4356 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4357 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4358 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4359 }
4360
4361 if (StoreVT.getNumElements() == 4) {
4362 SmallVector<Register, 4> PackedRegs;
4363 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4364 auto Unmerge = B.buildUnmerge(S32, Reg);
4365 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4366 PackedRegs.push_back(Unmerge.getReg(I));
4367 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4368 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4369 .getReg(0);
4370 }
4371
4372 llvm_unreachable("invalid data type");
4373 }
4374
4375 if (StoreVT == LLT::fixed_vector(3, S16)) {
4376 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4377 .getReg(0);
4378 }
4379 return Reg;
4380 }
4381
4382 Register AMDGPULegalizerInfo::fixStoreSourceType(
4383 MachineIRBuilder &B, Register VData, bool IsFormat) const {
4384 MachineRegisterInfo *MRI = B.getMRI();
4385 LLT Ty = MRI->getType(VData);
4386
4387 const LLT S16 = LLT::scalar(16);
4388
4389 // Fix up illegal register types for i8 and i16 stores.
4390 if (Ty == LLT::scalar(8) || Ty == S16) {
4391 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4392 return AnyExt;
4393 }
4394
4395 if (Ty.isVector()) {
4396 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4397 if (IsFormat)
4398 return handleD16VData(B, *MRI, VData);
4399 }
4400 }
4401
4402 return VData;
4403 }
4404
4405 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4406 MachineRegisterInfo &MRI,
4407 MachineIRBuilder &B,
4408 bool IsTyped,
4409 bool IsFormat) const {
4410 Register VData = MI.getOperand(1).getReg();
4411 LLT Ty = MRI.getType(VData);
4412 LLT EltTy = Ty.getScalarType();
4413 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4414 const LLT S32 = LLT::scalar(32);
4415
4416 VData = fixStoreSourceType(B, VData, IsFormat);
4417 Register RSrc = MI.getOperand(2).getReg();
4418
4419 MachineMemOperand *MMO = *MI.memoperands_begin();
4420 const int MemSize = MMO->getSize();
4421
4422 unsigned ImmOffset;
4423
4424 // The typed intrinsics add an immediate after the registers.
4425 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4426
4427 // The struct intrinsic variants add one additional operand over raw.
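// Roughly, the incoming operand order is:
//   raw:    (vdata, rsrc, voffset, soffset, [format,] aux)
//   struct: (vdata, rsrc, vindex, voffset, soffset, [format,] aux)
// with the format immediate present only for the typed (tbuffer) variants.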
4428 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4429 Register VIndex;
4430 int OpOffset = 0;
4431 if (HasVIndex) {
4432 VIndex = MI.getOperand(3).getReg();
4433 OpOffset = 1;
4434 } else {
4435 VIndex = B.buildConstant(S32, 0).getReg(0);
4436 }
4437
4438 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4439 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4440
4441 unsigned Format = 0;
4442 if (IsTyped) {
4443 Format = MI.getOperand(5 + OpOffset).getImm();
4444 ++OpOffset;
4445 }
4446
4447 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4448
4449 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4450 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4451
4452 unsigned Opc;
4453 if (IsTyped) {
4454 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4455 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4456 } else if (IsFormat) {
4457 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4458 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4459 } else {
4460 switch (MemSize) {
4461 case 1:
4462 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4463 break;
4464 case 2:
4465 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4466 break;
4467 default:
4468 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4469 break;
4470 }
4471 }
4472
4473 auto MIB = B.buildInstr(Opc)
4474 .addUse(VData) // vdata
4475 .addUse(RSrc) // rsrc
4476 .addUse(VIndex) // vindex
4477 .addUse(VOffset) // voffset
4478 .addUse(SOffset) // soffset
4479 .addImm(ImmOffset); // offset(imm)
4480
4481 if (IsTyped)
4482 MIB.addImm(Format);
4483
4484 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4485 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4486 .addMemOperand(MMO);
4487
4488 MI.eraseFromParent();
4489 return true;
4490 }
4491
4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4493 MachineRegisterInfo &MRI,
4494 MachineIRBuilder &B,
4495 bool IsFormat,
4496 bool IsTyped) const {
4497 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4498 MachineMemOperand *MMO = *MI.memoperands_begin();
4499 const LLT MemTy = MMO->getMemoryType();
4500 const LLT S32 = LLT::scalar(32);
4501
4502 Register Dst = MI.getOperand(0).getReg();
4503 Register RSrc = MI.getOperand(2).getReg();
4504
4505 // The typed intrinsics add an immediate after the registers.
4506 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4507
4508 // The struct intrinsic variants add one additional operand over raw.
4509 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4510 Register VIndex;
4511 int OpOffset = 0;
4512 if (HasVIndex) {
4513 VIndex = MI.getOperand(3).getReg();
4514 OpOffset = 1;
4515 } else {
4516 VIndex = B.buildConstant(S32, 0).getReg(0);
4517 }
4518
4519 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4520 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4521
4522 unsigned Format = 0;
4523 if (IsTyped) {
4524 Format = MI.getOperand(5 + OpOffset).getImm();
4525 ++OpOffset;
4526 }
4527
4528 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4529 unsigned ImmOffset;
4530
4531 LLT Ty = MRI.getType(Dst);
4532 LLT EltTy = Ty.getScalarType();
4533 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4534 const bool Unpacked = ST.hasUnpackedD16VMem();
4535
4536 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4537 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4538
4539 unsigned Opc;
4540
4541 if (IsTyped) {
4542 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4543 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4544 } else if (IsFormat) {
4545 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4546 AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4547 } else {
4548 switch (MemTy.getSizeInBits()) {
4549 case 8:
4550 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4551 break;
4552 case 16:
4553 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4554 break;
4555 default:
4556 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4557 break;
4558 }
4559 }
4560
4561 Register LoadDstReg;
4562
4563 bool IsExtLoad =
4564 (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4565 LLT UnpackedTy = Ty.changeElementSize(32);
4566
4567 if (IsExtLoad)
4568 LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4569 else if (Unpacked && IsD16 && Ty.isVector())
4570 LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4571 else
4572 LoadDstReg = Dst;
4573
4574 auto MIB = B.buildInstr(Opc)
4575 .addDef(LoadDstReg) // vdata
4576 .addUse(RSrc) // rsrc
4577 .addUse(VIndex) // vindex
4578 .addUse(VOffset) // voffset
4579 .addUse(SOffset) // soffset
4580 .addImm(ImmOffset); // offset(imm)
4581
4582 if (IsTyped)
4583 MIB.addImm(Format);
4584
4585 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4586 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4587 .addMemOperand(MMO);
4588
4589 if (LoadDstReg != Dst) {
4590 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4591
4592 // The load result was widened; narrow it back down to the requested type.
4593 if (IsExtLoad)
4594 B.buildTrunc(Dst, LoadDstReg);
4595 else {
4596 // Repack to original 16-bit vector result
4597 // FIXME: G_TRUNC should work, but legalization currently fails
4598 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4599 SmallVector<Register, 4> Repack;
4600 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4601 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4602 B.buildMerge(Dst, Repack);
4603 }
4604 }
4605
4606 MI.eraseFromParent();
4607 return true;
4608 }
4609
4610 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4611 MachineIRBuilder &B,
4612 bool IsInc) const {
4613 unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4614 AMDGPU::G_AMDGPU_ATOMIC_DEC;
4615 B.buildInstr(Opc)
4616 .addDef(MI.getOperand(0).getReg())
4617 .addUse(MI.getOperand(2).getReg())
4618 .addUse(MI.getOperand(3).getReg())
4619 .cloneMemRefs(MI);
4620 MI.eraseFromParent();
4621 return true;
4622 }
4623
4624 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4625 switch (IntrID) {
4626 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4627 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4628 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4629 case Intrinsic::amdgcn_raw_buffer_atomic_add:
4630 case Intrinsic::amdgcn_struct_buffer_atomic_add:
4631 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4632 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4633 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4634 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4635 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4636 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4637 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4638 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4639 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4640 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4641 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4642 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4643 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4644 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4645 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4646 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4647 case Intrinsic::amdgcn_raw_buffer_atomic_and:
4648 case Intrinsic::amdgcn_struct_buffer_atomic_and:
4649 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4650 case Intrinsic::amdgcn_raw_buffer_atomic_or:
4651 case Intrinsic::amdgcn_struct_buffer_atomic_or:
4652 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4653 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4654 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4656 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4657 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4658 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4659 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4660 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4661 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4662 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4663 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4664 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4665 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4666 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4667 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4668 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4669 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4670 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4671 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4672 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4673 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4674 default:
4675 llvm_unreachable("unhandled atomic opcode");
4676 }
4677 }
4678
4679 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4680 MachineIRBuilder &B,
4681 Intrinsic::ID IID) const {
4682 const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4683 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4684 const bool HasReturn = MI.getNumExplicitDefs() != 0;
4685
4686 Register Dst;
4687
4688 int OpOffset = 0;
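  // Without a return value there is no def operand, so every source operand
  // index below shifts down by one.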
4689 if (HasReturn) {
4690 // A few FP atomics do not support return values.
4691 Dst = MI.getOperand(0).getReg();
4692 } else {
4693 OpOffset = -1;
4694 }
4695
4696 Register VData = MI.getOperand(2 + OpOffset).getReg();
4697 Register CmpVal;
4698
4699 if (IsCmpSwap) {
4700 CmpVal = MI.getOperand(3 + OpOffset).getReg();
4701 ++OpOffset;
4702 }
4703
4704 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4705 const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4706
4707 // The struct intrinsic variants add one additional operand over raw.
4708 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4709 Register VIndex;
4710 if (HasVIndex) {
4711 VIndex = MI.getOperand(4 + OpOffset).getReg();
4712 ++OpOffset;
4713 } else {
4714 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4715 }
4716
4717 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4718 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4719 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4720
4721 MachineMemOperand *MMO = *MI.memoperands_begin();
4722
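  // Split the variable offset into a register portion and an immediate that
  // fits in the instruction's offset field, and update the memory operand to
  // match the known offsets.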
4723 unsigned ImmOffset;
4724 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4725 updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4726
4727 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4728
4729 if (HasReturn)
4730 MIB.addDef(Dst);
4731
4732 MIB.addUse(VData); // vdata
4733
4734 if (IsCmpSwap)
4735 MIB.addReg(CmpVal);
4736
4737 MIB.addUse(RSrc) // rsrc
4738 .addUse(VIndex) // vindex
4739 .addUse(VOffset) // voffset
4740 .addUse(SOffset) // soffset
4741 .addImm(ImmOffset) // offset(imm)
4742 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
4743 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4744 .addMemOperand(MMO);
4745
4746 MI.eraseFromParent();
4747 return true;
4748 }
4749
4750 /// Turn the set of s16 typed vaddr operands of \p MI into dword sized
4751 /// vectors with s16 typed elements, appending the results to \p PackedAddrs.
4752 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4753 SmallVectorImpl<Register> &PackedAddrs,
4754 unsigned ArgOffset,
4755 const AMDGPU::ImageDimIntrinsicInfo *Intr,
4756 bool IsA16, bool IsG16) {
4757 const LLT S16 = LLT::scalar(16);
4758 const LLT V2S16 = LLT::fixed_vector(2, 16);
4759 auto EndIdx = Intr->VAddrEnd;
4760
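  // Every packed address ends up as one <2 x s16> dword: 32-bit operands are
  // bitcast to <2 x s16>, and 16-bit operands are paired (or padded with
  // undef) via build_vector.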
4761 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4762 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4763 if (!SrcOp.isReg())
4764 continue; // _L to _LZ may have eliminated this.
4765
4766 Register AddrReg = SrcOp.getReg();
4767
4768 if ((I < Intr->GradientStart) ||
4769 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4770 (I >= Intr->CoordStart && !IsA16)) {
4771 if ((I < Intr->GradientStart) && IsA16 &&
4772 (B.getMRI()->getType(AddrReg) == S16)) {
4773 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4774 // Special handling of bias when A16 is on. Bias is of type half but
4775 // occupies full 32-bit.
4776 PackedAddrs.push_back(
4777 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4778 .getReg(0));
4779 } else {
4780 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4781 "Bias needs to be converted to 16 bit in A16 mode");
4782 // Handle any gradient or coordinate operands that should not be packed
4783 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4784 PackedAddrs.push_back(AddrReg);
4785 }
4786 } else {
4787 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4788 // derivatives dx/dh and dx/dv are packed with undef.
4789 if (((I + 1) >= EndIdx) ||
4790 ((Intr->NumGradients / 2) % 2 == 1 &&
4791 (I == static_cast<unsigned>(Intr->GradientStart +
4792 (Intr->NumGradients / 2) - 1) ||
4793 I == static_cast<unsigned>(Intr->GradientStart +
4794 Intr->NumGradients - 1))) ||
4795 // Check for _L to _LZ optimization
4796 !MI.getOperand(ArgOffset + I + 1).isReg()) {
4797 PackedAddrs.push_back(
4798 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4799 .getReg(0));
4800 } else {
4801 PackedAddrs.push_back(
4802 B.buildBuildVector(
4803 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4804 .getReg(0));
4805 ++I;
4806 }
4807 }
4808 }
4809 }
4810
4811 /// Convert from separate vaddr components to a single vector address register,
4812 /// and replace the remaining operands with $noreg.
4813 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4814 int DimIdx, int NumVAddrs) {
4815 const LLT S32 = LLT::scalar(32);
4816
4817 SmallVector<Register, 8> AddrRegs;
4818 for (int I = 0; I != NumVAddrs; ++I) {
4819 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4820 if (SrcOp.isReg()) {
4821 AddrRegs.push_back(SrcOp.getReg());
4822 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4823 }
4824 }
4825
4826 int NumAddrRegs = AddrRegs.size();
4827 if (NumAddrRegs != 1) {
4828 // Above 8 elements round up to next power of 2 (i.e. 16).
4829 if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4830 const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4831 auto Undef = B.buildUndef(S32);
4832 AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4833 NumAddrRegs = RoundedNumRegs;
4834 }
4835
4836 auto VAddr =
4837 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4838 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4839 }
4840
4841 for (int I = 1; I != NumVAddrs; ++I) {
4842 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4843 if (SrcOp.isReg())
4844 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4845 }
4846 }
4847
4848 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4849 ///
4850 /// Depending on the subtarget, loads/stores with 16-bit element data need to be
4851 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4852 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4853 /// registers.
4854 ///
4855 /// We don't want to directly select image instructions just yet, but we also
4856 /// want to expose all register repacking to the legalizer/combiners. We also don't
4857 /// want a selected instruction entering RegBankSelect. In order to avoid
4858 /// defining a multitude of intermediate image instructions, directly hack on
4859 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4860 /// padding now-unnecessary arguments with $noreg.
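///
/// For example, with the unpacked d16 layout a d16 load result is widened here
/// to one s32 register per component, then truncated and re-packed to the
/// original 16-bit type after the instruction.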
4861 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4862 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4863 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4864
4865 const unsigned NumDefs = MI.getNumExplicitDefs();
4866 const unsigned ArgOffset = NumDefs + 1;
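  // A second def means the intrinsic also returns the TFE status dword in
  // addition to the loaded data.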
4867 bool IsTFE = NumDefs == 2;
4868 // We are only processing the operands of d16 image operations on subtargets
4869 // that use the unpacked register layout, or need to repack the TFE result.
4870
4871 // TODO: Do we need to guard against already legalized intrinsics?
4872 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4873 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4874
4875 MachineRegisterInfo *MRI = B.getMRI();
4876 const LLT S32 = LLT::scalar(32);
4877 const LLT S16 = LLT::scalar(16);
4878 const LLT V2S16 = LLT::fixed_vector(2, 16);
4879
4880 unsigned DMask = 0;
4881 Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4882 LLT Ty = MRI->getType(VData);
4883
4884   // Check for 16-bit addresses and pack them if present.
4885 LLT GradTy =
4886 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4887 LLT AddrTy =
4888 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4889 const bool IsG16 = GradTy == S16;
4890 const bool IsA16 = AddrTy == S16;
4891 const bool IsD16 = Ty.getScalarType() == S16;
4892
4893 int DMaskLanes = 0;
4894 if (!BaseOpcode->Atomic) {
4895 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4896 if (BaseOpcode->Gather4) {
4897 DMaskLanes = 4;
4898 } else if (DMask != 0) {
4899 DMaskLanes = countPopulation(DMask);
4900 } else if (!IsTFE && !BaseOpcode->Store) {
4901 // If dmask is 0, this is a no-op load. This can be eliminated.
4902 B.buildUndef(MI.getOperand(0));
4903 MI.eraseFromParent();
4904 return true;
4905 }
4906 }
4907
4908 Observer.changingInstr(MI);
4909 auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4910
4911 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4912 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4913 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4914 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4915 unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4916
4917 // Track that we legalized this
4918 MI.setDesc(B.getTII().get(NewOpcode));
4919
4920   // Expecting to get an error flag since TFC is on and dmask is 0. Force dmask
4921   // to be at least 1, otherwise the instruction will fail.
4922 if (IsTFE && DMask == 0) {
4923 DMask = 0x1;
4924 DMaskLanes = 1;
4925 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4926 }
4927
4928 if (BaseOpcode->Atomic) {
4929 Register VData0 = MI.getOperand(2).getReg();
4930 LLT Ty = MRI->getType(VData0);
4931
4932 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4933 if (Ty.isVector())
4934 return false;
4935
4936 if (BaseOpcode->AtomicX2) {
4937 Register VData1 = MI.getOperand(3).getReg();
4938 // The two values are packed in one register.
4939 LLT PackedTy = LLT::fixed_vector(2, Ty);
4940 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4941 MI.getOperand(2).setReg(Concat.getReg(0));
4942 MI.getOperand(3).setReg(AMDGPU::NoRegister);
4943 }
4944 }
4945
4946 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4947
4948 // Rewrite the addressing register layout before doing anything else.
4949 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4950 // 16 bit gradients are supported, but are tied to the A16 control
4951 // so both gradients and addresses must be 16 bit
4952 return false;
4953 }
4954
4955 if (IsA16 && !ST.hasA16()) {
4956 // A16 not supported
4957 return false;
4958 }
4959
4960 if (IsA16 || IsG16) {
4961 if (Intr->NumVAddrs > 1) {
4962 SmallVector<Register, 4> PackedRegs;
4963
4964 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4965 IsG16);
4966
4967 // See also below in the non-a16 branch
4968 const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4969 PackedRegs.size() <= ST.getNSAMaxSize();
4970
4971 if (!UseNSA && PackedRegs.size() > 1) {
4972 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4973 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4974 PackedRegs[0] = Concat.getReg(0);
4975 PackedRegs.resize(1);
4976 }
4977
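      // Rewrite the original vaddr operands: the first NumPacked operands take
      // the packed registers, and any remaining address operands are cleared
      // to $noreg.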
4978 const unsigned NumPacked = PackedRegs.size();
4979 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4980 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4981 if (!SrcOp.isReg()) {
4982 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4983 continue;
4984 }
4985
4986 assert(SrcOp.getReg() != AMDGPU::NoRegister);
4987
4988 if (I - Intr->VAddrStart < NumPacked)
4989 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4990 else
4991 SrcOp.setReg(AMDGPU::NoRegister);
4992 }
4993 }
4994 } else {
4995 // If the register allocator cannot place the address registers contiguously
4996 // without introducing moves, then using the non-sequential address encoding
4997 // is always preferable, since it saves VALU instructions and is usually a
4998 // wash in terms of code size or even better.
4999 //
5000 // However, we currently have no way of hinting to the register allocator
5001 // that MIMG addresses should be placed contiguously when it is possible to
5002 // do so, so force non-NSA for the common 2-address case as a heuristic.
5003 //
5004 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5005 // allocation when possible.
5006 //
5007 // TODO: we can actually allow partial NSA where the final register is a
5008 // contiguous set of the remaining addresses.
5009 // This could help where there are more addresses than supported.
5010 const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
5011 CorrectedNumVAddrs <= ST.getNSAMaxSize();
5012
5013 if (!UseNSA && Intr->NumVAddrs > 1)
5014 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5015 Intr->NumVAddrs);
5016 }
5017
5018 int Flags = 0;
5019 if (IsA16)
5020 Flags |= 1;
5021 if (IsG16)
5022 Flags |= 2;
5023 MI.addOperand(MachineOperand::CreateImm(Flags));
5024
5025 if (BaseOpcode->Store) { // No TFE for stores?
5026 // TODO: Handle dmask trim
5027 if (!Ty.isVector() || !IsD16)
5028 return true;
5029
5030 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5031 if (RepackedReg != VData) {
5032 MI.getOperand(1).setReg(RepackedReg);
5033 }
5034
5035 return true;
5036 }
5037
5038 Register DstReg = MI.getOperand(0).getReg();
5039 const LLT EltTy = Ty.getScalarType();
5040 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5041
5042 // Confirm that the return type is large enough for the dmask specified
5043 if (NumElts < DMaskLanes)
5044 return false;
5045
5046 if (NumElts > 4 || DMaskLanes > 4)
5047 return false;
5048
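  // Only DMaskLanes components are actually written by the instruction, so
  // legalize the result based on that trimmed element count rather than the
  // IR return type.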
5049 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5050 const LLT AdjustedTy =
5051 Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5052
5053 // The raw dword aligned data component of the load. The only legal cases
5054 // where this matters should be when using the packed D16 format, for
5055   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
5056 LLT RoundedTy;
5057
5058   // S32 vector to cover all data, plus the TFE result element.
5059 LLT TFETy;
5060
5061 // Register type to use for each loaded component. Will be S32 or V2S16.
5062 LLT RegTy;
5063
5064 if (IsD16 && ST.hasUnpackedD16VMem()) {
5065 RoundedTy =
5066 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5067 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5068 RegTy = S32;
5069 } else {
5070 unsigned EltSize = EltTy.getSizeInBits();
5071 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5072 unsigned RoundedSize = 32 * RoundedElts;
5073 RoundedTy = LLT::scalarOrVector(
5074 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5075 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5076 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5077 }
5078
5079 // The return type does not need adjustment.
5080 // TODO: Should we change s16 case to s32 or <2 x s16>?
5081 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5082 return true;
5083
5084 Register Dst1Reg;
5085
5086 // Insert after the instruction.
5087 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5088
5089 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5090 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5091 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5092 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5093
5094 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5095
5096 MI.getOperand(0).setReg(NewResultReg);
5097
5098 // In the IR, TFE is supposed to be used with a 2 element struct return
5099 // type. The instruction really returns these two values in one contiguous
5100 // register, with one additional dword beyond the loaded data. Rewrite the
5101 // return type to use a single register result.
5102
5103 if (IsTFE) {
5104 Dst1Reg = MI.getOperand(1).getReg();
5105 if (MRI->getType(Dst1Reg) != S32)
5106 return false;
5107
5108 // TODO: Make sure the TFE operand bit is set.
5109 MI.removeOperand(1);
5110
5111 // Handle the easy case that requires no repack instructions.
5112 if (Ty == S32) {
5113 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5114 return true;
5115 }
5116 }
5117
5118 // Now figure out how to copy the new result register back into the old
5119 // result.
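  // Seed every slot with Dst1Reg so that, when TFE is enabled, the final
  // unmerged element (the status dword) is written directly to the user's
  // second result.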
5120 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5121
5122 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
5123
5124 if (ResultNumRegs == 1) {
5125 assert(!IsTFE);
5126 ResultRegs[0] = NewResultReg;
5127 } else {
5128 // We have to repack into a new vector of some kind.
5129 for (int I = 0; I != NumDataRegs; ++I)
5130 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5131 B.buildUnmerge(ResultRegs, NewResultReg);
5132
5133 // Drop the final TFE element to get the data part. The TFE result is
5134 // directly written to the right place already.
5135 if (IsTFE)
5136 ResultRegs.resize(NumDataRegs);
5137 }
5138
5139 // For an s16 scalar result, we form an s32 result with a truncate regardless
5140 // of packed vs. unpacked.
5141 if (IsD16 && !Ty.isVector()) {
5142 B.buildTrunc(DstReg, ResultRegs[0]);
5143 return true;
5144 }
5145
5146 // Avoid a build/concat_vector of 1 entry.
5147 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5148 B.buildBitcast(DstReg, ResultRegs[0]);
5149 return true;
5150 }
5151
5152 assert(Ty.isVector());
5153
5154 if (IsD16) {
5155 // For packed D16 results with TFE enabled, all the data components are
5156 // S32. Cast back to the expected type.
5157 //
5158     // TODO: We don't really need to load s32 elements. We would only need one
5159 // cast for the TFE result if a multiple of v2s16 was used.
5160 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5161 for (Register &Reg : ResultRegs)
5162 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5163 } else if (ST.hasUnpackedD16VMem()) {
5164 for (Register &Reg : ResultRegs)
5165 Reg = B.buildTrunc(S16, Reg).getReg(0);
5166 }
5167 }
5168
5169 auto padWithUndef = [&](LLT Ty, int NumElts) {
5170 if (NumElts == 0)
5171 return;
5172 Register Undef = B.buildUndef(Ty).getReg(0);
5173 for (int I = 0; I != NumElts; ++I)
5174 ResultRegs.push_back(Undef);
5175 };
5176
5177 // Pad out any elements eliminated due to the dmask.
5178 LLT ResTy = MRI->getType(ResultRegs[0]);
5179 if (!ResTy.isVector()) {
5180 padWithUndef(ResTy, NumElts - ResultRegs.size());
5181 B.buildBuildVector(DstReg, ResultRegs);
5182 return true;
5183 }
5184
5185 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5186 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5187
5188 // Deal with the one annoying legal case.
5189 const LLT V3S16 = LLT::fixed_vector(3, 16);
5190 if (Ty == V3S16) {
5191 if (IsTFE) {
5192 if (ResultRegs.size() == 1) {
5193 NewResultReg = ResultRegs[0];
5194 } else if (ResultRegs.size() == 2) {
5195 LLT V4S16 = LLT::fixed_vector(4, 16);
5196 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5197 } else {
5198 return false;
5199 }
5200 }
5201
5202 if (MRI->getType(DstReg).getNumElements() <
5203 MRI->getType(NewResultReg).getNumElements()) {
5204 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5205 } else {
5206 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5207 }
5208 return true;
5209 }
5210
5211 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5212 B.buildConcatVectors(DstReg, ResultRegs);
5213 return true;
5214 }
5215
5216 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5217 LegalizerHelper &Helper, MachineInstr &MI) const {
5218 MachineIRBuilder &B = Helper.MIRBuilder;
5219 GISelChangeObserver &Observer = Helper.Observer;
5220
5221 Register Dst = MI.getOperand(0).getReg();
5222 LLT Ty = B.getMRI()->getType(Dst);
5223 unsigned Size = Ty.getSizeInBits();
5224 MachineFunction &MF = B.getMF();
5225
5226 Observer.changingInstr(MI);
5227
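  // If this result type is better handled as an equivalently sized scalar or
  // vector, bitcast the destination and legalize that type instead.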
5228 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5229 Ty = getBitcastRegisterType(Ty);
5230 Helper.bitcastDst(MI, Ty, 0);
5231 Dst = MI.getOperand(0).getReg();
5232 B.setInsertPt(B.getMBB(), MI);
5233 }
5234
5235 // FIXME: We don't really need this intermediate instruction. The intrinsic
5236 // should be fixed to have a memory operand. Since it's readnone, we're not
5237 // allowed to add one.
5238 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5239 MI.removeOperand(1); // Remove intrinsic ID
5240
5241 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5242 // TODO: Should this use datalayout alignment?
5243 const unsigned MemSize = (Size + 7) / 8;
5244 const Align MemAlign(4);
5245 MachineMemOperand *MMO = MF.getMachineMemOperand(
5246 MachinePointerInfo(),
5247 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5248 MachineMemOperand::MOInvariant,
5249 MemSize, MemAlign);
5250 MI.addMemOperand(MF, MMO);
5251
5252 // There are no 96-bit result scalar loads, but widening to 128-bit should
5253 // always be legal. We may need to restore this to a 96-bit result if it turns
5254 // out this needs to be converted to a vector load during RegBankSelect.
5255 if (!isPowerOf2_32(Size)) {
5256 if (Ty.isVector())
5257 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5258 else
5259 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5260 }
5261
5262 Observer.changedInstr(MI);
5263 return true;
5264 }
5265
5266 // TODO: Move to selection
5267 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5268 MachineRegisterInfo &MRI,
5269 MachineIRBuilder &B) const {
5270 if (!ST.isTrapHandlerEnabled() ||
5271 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5272 return legalizeTrapEndpgm(MI, MRI, B);
5273
5274 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5275 switch (*HsaAbiVer) {
5276 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5277 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5278 return legalizeTrapHsaQueuePtr(MI, MRI, B);
5279 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5280 case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5281 return ST.supportsGetDoorbellID() ?
5282 legalizeTrapHsa(MI, MRI, B) :
5283 legalizeTrapHsaQueuePtr(MI, MRI, B);
5284 }
5285 }
5286
5287 llvm_unreachable("Unknown trap handler");
5288 }
5289
5290 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5291 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5292 B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5293 MI.eraseFromParent();
5294 return true;
5295 }
5296
5297 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5298 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5299 MachineFunction &MF = B.getMF();
5300 const LLT S64 = LLT::scalar(64);
5301
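  // The trap handler ABI (referenced below) expects the queue pointer in
  // SGPR0_SGPR1.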
5302 Register SGPR01(AMDGPU::SGPR0_SGPR1);
5303 // For code object version 5, queue_ptr is passed through implicit kernarg.
5304 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5305 AMDGPUTargetLowering::ImplicitParameter Param =
5306 AMDGPUTargetLowering::QUEUE_PTR;
5307 uint64_t Offset =
5308 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5309
5310 Register KernargPtrReg = MRI.createGenericVirtualRegister(
5311 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5312
5313 if (!loadInputValue(KernargPtrReg, B,
5314 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5315 return false;
5316
5317 // TODO: can we be smarter about machine pointer info?
5318 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5319 MachineMemOperand *MMO = MF.getMachineMemOperand(
5320 PtrInfo,
5321 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5322 MachineMemOperand::MOInvariant,
5323 LLT::scalar(64), commonAlignment(Align(64), Offset));
5324
5325 // Pointer address
5326 Register LoadAddr = MRI.createGenericVirtualRegister(
5327 LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5328 B.buildPtrAdd(LoadAddr, KernargPtrReg,
5329 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5330 // Load address
5331 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5332 B.buildCopy(SGPR01, Temp);
5333 B.buildInstr(AMDGPU::S_TRAP)
5334 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5335 .addReg(SGPR01, RegState::Implicit);
5336 MI.eraseFromParent();
5337 return true;
5338 }
5339
5340 // Pass queue pointer to trap handler as input, and insert trap instruction
5341 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5342 Register LiveIn =
5343 MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5344 if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5345 return false;
5346
5347 B.buildCopy(SGPR01, LiveIn);
5348 B.buildInstr(AMDGPU::S_TRAP)
5349 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5350 .addReg(SGPR01, RegState::Implicit);
5351
5352 MI.eraseFromParent();
5353 return true;
5354 }
5355
5356 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5357 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5358 B.buildInstr(AMDGPU::S_TRAP)
5359 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5360 MI.eraseFromParent();
5361 return true;
5362 }
5363
5364 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5365 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5366   // If this is a non-HSA path or the trap handler is disabled, report a
5367   // warning accordingly.
5368 if (!ST.isTrapHandlerEnabled() ||
5369 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5370 DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5371 "debugtrap handler not supported",
5372 MI.getDebugLoc(), DS_Warning);
5373 LLVMContext &Ctx = B.getMF().getFunction().getContext();
5374 Ctx.diagnose(NoTrap);
5375 } else {
5376 // Insert debug-trap instruction
5377 B.buildInstr(AMDGPU::S_TRAP)
5378 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5379 }
5380
5381 MI.eraseFromParent();
5382 return true;
5383 }
5384
5385 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5386 MachineIRBuilder &B) const {
5387 MachineRegisterInfo &MRI = *B.getMRI();
5388 const LLT S16 = LLT::scalar(16);
5389 const LLT S32 = LLT::scalar(32);
5390 const LLT V2S16 = LLT::fixed_vector(2, 16);
5391 const LLT V3S32 = LLT::fixed_vector(3, 32);
5392
5393 Register DstReg = MI.getOperand(0).getReg();
5394 Register NodePtr = MI.getOperand(2).getReg();
5395 Register RayExtent = MI.getOperand(3).getReg();
5396 Register RayOrigin = MI.getOperand(4).getReg();
5397 Register RayDir = MI.getOperand(5).getReg();
5398 Register RayInvDir = MI.getOperand(6).getReg();
5399 Register TDescr = MI.getOperand(7).getReg();
5400
5401 if (!ST.hasGFX10_AEncoding()) {
5402 DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5403 "intrinsic not supported on subtarget",
5404 MI.getDebugLoc());
5405 B.getMF().getFunction().getContext().diagnose(BadIntrin);
5406 return false;
5407 }
5408
5409 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5410 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5411 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5412 const unsigned NumVDataDwords = 4;
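  // Address dword count: 1 or 2 dwords for the node pointer, 1 for the ray
  // extent, 3 for the ray origin, and either 6 dwords (f32 dir + inv_dir) or
  // 3 dwords (packed f16 dir + inv_dir) for the directions. On GFX11+ NSA,
  // vector operands count as a single address, giving 4 (A16) or 5 addresses.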
5413 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5414 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5415 const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5416 const unsigned BaseOpcodes[2][2] = {
5417 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5418 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5419 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5420 int Opcode;
5421 if (UseNSA) {
5422 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5423 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5424 : AMDGPU::MIMGEncGfx10NSA,
5425 NumVDataDwords, NumVAddrDwords);
5426 } else {
5427 Opcode = AMDGPU::getMIMGOpcode(
5428 BaseOpcodes[Is64][IsA16],
5429 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5430 NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
5431 }
5432 assert(Opcode != -1);
5433
5434 SmallVector<Register, 12> Ops;
5435 if (UseNSA && IsGFX11Plus) {
5436 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5437 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5438 auto Merged = B.buildMerge(
5439 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5440 Ops.push_back(Merged.getReg(0));
5441 };
5442
5443 Ops.push_back(NodePtr);
5444 Ops.push_back(RayExtent);
5445 packLanes(RayOrigin);
5446
5447 if (IsA16) {
5448 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5449 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5450 auto MergedDir = B.buildMerge(
5451 V3S32,
5452 {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
5453 UnmergeRayDir.getReg(0)}))
5454 .getReg(0),
5455 B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
5456 UnmergeRayDir.getReg(1)}))
5457 .getReg(0),
5458 B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
5459 UnmergeRayDir.getReg(2)}))
5460 .getReg(0)});
5461 Ops.push_back(MergedDir.getReg(0));
5462 } else {
5463 packLanes(RayDir);
5464 packLanes(RayInvDir);
5465 }
5466 } else {
5467 if (Is64) {
5468 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5469 Ops.push_back(Unmerge.getReg(0));
5470 Ops.push_back(Unmerge.getReg(1));
5471 } else {
5472 Ops.push_back(NodePtr);
5473 }
5474 Ops.push_back(RayExtent);
5475
5476 auto packLanes = [&Ops, &S32, &B](Register Src) {
5477 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5478 Ops.push_back(Unmerge.getReg(0));
5479 Ops.push_back(Unmerge.getReg(1));
5480 Ops.push_back(Unmerge.getReg(2));
5481 };
5482
5483 packLanes(RayOrigin);
5484 if (IsA16) {
5485 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5486 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5487 Register R1 = MRI.createGenericVirtualRegister(S32);
5488 Register R2 = MRI.createGenericVirtualRegister(S32);
5489 Register R3 = MRI.createGenericVirtualRegister(S32);
5490 B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5491 B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5492 B.buildMerge(R3,
5493 {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5494 Ops.push_back(R1);
5495 Ops.push_back(R2);
5496 Ops.push_back(R3);
5497 } else {
5498 packLanes(RayDir);
5499 packLanes(RayInvDir);
5500 }
5501 }
5502
5503 if (!UseNSA) {
5504 // Build a single vector containing all the operands so far prepared.
5505 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5506 Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5507 Ops.clear();
5508 Ops.push_back(MergedOps);
5509 }
5510
5511 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5512 .addDef(DstReg)
5513 .addImm(Opcode);
5514
5515 for (Register R : Ops) {
5516 MIB.addUse(R);
5517 }
5518
5519 MIB.addUse(TDescr)
5520 .addImm(IsA16 ? 1 : 0)
5521 .cloneMemRefs(MI);
5522
5523 MI.eraseFromParent();
5524 return true;
5525 }
5526
5527 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5528 MachineIRBuilder &B) const {
5529 unsigned Opc;
5530 int RoundMode = MI.getOperand(2).getImm();
5531
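  // Only the toward-positive and toward-negative rounding modes have dedicated
  // pseudo opcodes; other modes are rejected here.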
5532 if (RoundMode == (int)RoundingMode::TowardPositive)
5533 Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5534 else if (RoundMode == (int)RoundingMode::TowardNegative)
5535 Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5536 else
5537 return false;
5538
5539 B.buildInstr(Opc)
5540 .addDef(MI.getOperand(0).getReg())
5541 .addUse(MI.getOperand(1).getReg());
5542
5543 MI.eraseFromParent();
5544
5545 return true;
5546 }
5547
5548 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5549 MachineInstr &MI) const {
5550 MachineIRBuilder &B = Helper.MIRBuilder;
5551 MachineRegisterInfo &MRI = *B.getMRI();
5552
5553   // Replace the G_BRCOND use of the intrinsic result with the exec manipulation and branch pseudos.
5554 auto IntrID = MI.getIntrinsicID();
5555 switch (IntrID) {
5556 case Intrinsic::amdgcn_if:
5557 case Intrinsic::amdgcn_else: {
5558 MachineInstr *Br = nullptr;
5559 MachineBasicBlock *UncondBrTarget = nullptr;
5560 bool Negated = false;
5561 if (MachineInstr *BrCond =
5562 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5563 const SIRegisterInfo *TRI
5564 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5565
5566 Register Def = MI.getOperand(1).getReg();
5567 Register Use = MI.getOperand(3).getReg();
5568
5569 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5570
5571 if (Negated)
5572 std::swap(CondBrTarget, UncondBrTarget);
5573
5574 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5575 if (IntrID == Intrinsic::amdgcn_if) {
5576 B.buildInstr(AMDGPU::SI_IF)
5577 .addDef(Def)
5578 .addUse(Use)
5579 .addMBB(UncondBrTarget);
5580 } else {
5581 B.buildInstr(AMDGPU::SI_ELSE)
5582 .addDef(Def)
5583 .addUse(Use)
5584 .addMBB(UncondBrTarget);
5585 }
5586
5587 if (Br) {
5588 Br->getOperand(0).setMBB(CondBrTarget);
5589 } else {
5590 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5591 // since we're swapping branch targets it needs to be reinserted.
5592 // FIXME: IRTranslator should probably not do this
5593 B.buildBr(*CondBrTarget);
5594 }
5595
5596 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5597 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5598 MI.eraseFromParent();
5599 BrCond->eraseFromParent();
5600 return true;
5601 }
5602
5603 return false;
5604 }
5605 case Intrinsic::amdgcn_loop: {
5606 MachineInstr *Br = nullptr;
5607 MachineBasicBlock *UncondBrTarget = nullptr;
5608 bool Negated = false;
5609 if (MachineInstr *BrCond =
5610 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5611 const SIRegisterInfo *TRI
5612 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5613
5614 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5615 Register Reg = MI.getOperand(2).getReg();
5616
5617 if (Negated)
5618 std::swap(CondBrTarget, UncondBrTarget);
5619
5620 B.setInsertPt(B.getMBB(), BrCond->getIterator());
5621 B.buildInstr(AMDGPU::SI_LOOP)
5622 .addUse(Reg)
5623 .addMBB(UncondBrTarget);
5624
5625 if (Br)
5626 Br->getOperand(0).setMBB(CondBrTarget);
5627 else
5628 B.buildBr(*CondBrTarget);
5629
5630 MI.eraseFromParent();
5631 BrCond->eraseFromParent();
5632 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5633 return true;
5634 }
5635
5636 return false;
5637 }
5638 case Intrinsic::amdgcn_kernarg_segment_ptr:
5639 if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5640 // This only makes sense to call in a kernel, so just lower to null.
5641 B.buildConstant(MI.getOperand(0).getReg(), 0);
5642 MI.eraseFromParent();
5643 return true;
5644 }
5645
5646 return legalizePreloadedArgIntrin(
5647 MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5648 case Intrinsic::amdgcn_implicitarg_ptr:
5649 return legalizeImplicitArgPtr(MI, MRI, B);
5650 case Intrinsic::amdgcn_workitem_id_x:
5651 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5652 AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5653 case Intrinsic::amdgcn_workitem_id_y:
5654 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5655 AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5656 case Intrinsic::amdgcn_workitem_id_z:
5657 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5658 AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5659 case Intrinsic::amdgcn_workgroup_id_x:
5660 return legalizePreloadedArgIntrin(MI, MRI, B,
5661 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5662 case Intrinsic::amdgcn_workgroup_id_y:
5663 return legalizePreloadedArgIntrin(MI, MRI, B,
5664 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5665 case Intrinsic::amdgcn_workgroup_id_z:
5666 return legalizePreloadedArgIntrin(MI, MRI, B,
5667 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5668 case Intrinsic::amdgcn_lds_kernel_id:
5669 return legalizePreloadedArgIntrin(MI, MRI, B,
5670 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5671 case Intrinsic::amdgcn_dispatch_ptr:
5672 return legalizePreloadedArgIntrin(MI, MRI, B,
5673 AMDGPUFunctionArgInfo::DISPATCH_PTR);
5674 case Intrinsic::amdgcn_queue_ptr:
5675 return legalizePreloadedArgIntrin(MI, MRI, B,
5676 AMDGPUFunctionArgInfo::QUEUE_PTR);
5677 case Intrinsic::amdgcn_implicit_buffer_ptr:
5678 return legalizePreloadedArgIntrin(
5679 MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5680 case Intrinsic::amdgcn_dispatch_id:
5681 return legalizePreloadedArgIntrin(MI, MRI, B,
5682 AMDGPUFunctionArgInfo::DISPATCH_ID);
5683 case Intrinsic::r600_read_ngroups_x:
5684 // TODO: Emit error for hsa
5685 return legalizeKernargMemParameter(MI, B,
5686 SI::KernelInputOffsets::NGROUPS_X);
5687 case Intrinsic::r600_read_ngroups_y:
5688 return legalizeKernargMemParameter(MI, B,
5689 SI::KernelInputOffsets::NGROUPS_Y);
5690 case Intrinsic::r600_read_ngroups_z:
5691 return legalizeKernargMemParameter(MI, B,
5692 SI::KernelInputOffsets::NGROUPS_Z);
5693 case Intrinsic::r600_read_local_size_x:
5694 // TODO: Could insert G_ASSERT_ZEXT from s16
5695 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5696 case Intrinsic::r600_read_local_size_y:
5697 // TODO: Could insert G_ASSERT_ZEXT from s16
5698 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5699   case Intrinsic::r600_read_local_size_z:
5700     // TODO: Could insert G_ASSERT_ZEXT from s16
5701     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5702 case Intrinsic::r600_read_global_size_x:
5703 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5704 case Intrinsic::r600_read_global_size_y:
5705 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5706 case Intrinsic::r600_read_global_size_z:
5707 return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5708 case Intrinsic::amdgcn_fdiv_fast:
5709 return legalizeFDIVFastIntrin(MI, MRI, B);
5710 case Intrinsic::amdgcn_is_shared:
5711 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5712 case Intrinsic::amdgcn_is_private:
5713 return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5714 case Intrinsic::amdgcn_wavefrontsize: {
5715 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5716 MI.eraseFromParent();
5717 return true;
5718 }
5719 case Intrinsic::amdgcn_s_buffer_load:
5720 return legalizeSBufferLoad(Helper, MI);
5721 case Intrinsic::amdgcn_raw_buffer_store:
5722 case Intrinsic::amdgcn_struct_buffer_store:
5723 return legalizeBufferStore(MI, MRI, B, false, false);
5724 case Intrinsic::amdgcn_raw_buffer_store_format:
5725 case Intrinsic::amdgcn_struct_buffer_store_format:
5726 return legalizeBufferStore(MI, MRI, B, false, true);
5727 case Intrinsic::amdgcn_raw_tbuffer_store:
5728 case Intrinsic::amdgcn_struct_tbuffer_store:
5729 return legalizeBufferStore(MI, MRI, B, true, true);
5730 case Intrinsic::amdgcn_raw_buffer_load:
5731 case Intrinsic::amdgcn_struct_buffer_load:
5732 return legalizeBufferLoad(MI, MRI, B, false, false);
5733 case Intrinsic::amdgcn_raw_buffer_load_format:
5734 case Intrinsic::amdgcn_struct_buffer_load_format:
5735 return legalizeBufferLoad(MI, MRI, B, true, false);
5736 case Intrinsic::amdgcn_raw_tbuffer_load:
5737 case Intrinsic::amdgcn_struct_tbuffer_load:
5738 return legalizeBufferLoad(MI, MRI, B, true, true);
5739 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5740 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5741 case Intrinsic::amdgcn_raw_buffer_atomic_add:
5742 case Intrinsic::amdgcn_struct_buffer_atomic_add:
5743 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5744 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5745 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5746 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5747 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5748 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5749 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5750 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5751 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5752 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5753 case Intrinsic::amdgcn_raw_buffer_atomic_and:
5754 case Intrinsic::amdgcn_struct_buffer_atomic_and:
5755 case Intrinsic::amdgcn_raw_buffer_atomic_or:
5756 case Intrinsic::amdgcn_struct_buffer_atomic_or:
5757 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5758 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5759 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5760 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5761 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5762 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5763 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5764 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5765 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5766 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5767 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5768 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5769 return legalizeBufferAtomic(MI, B, IntrID);
5770 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5771 case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5772 Register DstReg = MI.getOperand(0).getReg();
5773 if (!MRI.use_empty(DstReg) &&
5774 !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
5775 Function &F = B.getMF().getFunction();
5776 DiagnosticInfoUnsupported NoFpRet(
5777 F, "return versions of fp atomics not supported", B.getDebugLoc(),
5778 DS_Error);
5779 F.getContext().diagnose(NoFpRet);
5780 B.buildUndef(DstReg);
5781 MI.eraseFromParent();
5782 return true;
5783 }
5784
5785 return legalizeBufferAtomic(MI, B, IntrID);
5786 }
5787 case Intrinsic::amdgcn_atomic_inc:
5788 return legalizeAtomicIncDec(MI, B, true);
5789 case Intrinsic::amdgcn_atomic_dec:
5790 return legalizeAtomicIncDec(MI, B, false);
5791 case Intrinsic::trap:
5792 return legalizeTrapIntrinsic(MI, MRI, B);
5793 case Intrinsic::debugtrap:
5794 return legalizeDebugTrapIntrinsic(MI, MRI, B);
5795 case Intrinsic::amdgcn_rsq_clamp:
5796 return legalizeRsqClampIntrinsic(MI, MRI, B);
5797 case Intrinsic::amdgcn_ds_fadd:
5798 case Intrinsic::amdgcn_ds_fmin:
5799 case Intrinsic::amdgcn_ds_fmax:
5800 return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5801 case Intrinsic::amdgcn_image_bvh_intersect_ray:
5802 return legalizeBVHIntrinsic(MI, B);
5803 default: {
5804 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5805 AMDGPU::getImageDimIntrinsicInfo(IntrID))
5806 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5807 return true;
5808 }
5809 }
5810
5811 return true;
5812 }
5813