1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
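///
/// For illustration (a sketch, not exact pass output), waterfalling a
/// divergent value that must be consumed as an SGPR looks roughly like:
///
///   %val:vgpr(s32) = ...
///   loop:
///     %cur:sgpr(s32) = V_READFIRSTLANE_B32 %val
///     %match:vcc(s1) = G_ICMP eq %cur, %val   ; lanes sharing this value
///     ... run the SGPR-requiring operation using %cur ...
///     clear the matching lanes from EXEC; repeat until EXEC is empty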
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An s1 value in an SGPR always
36 /// means the VCC bank; any other type means the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
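///
/// As a sketch of the resulting regbank-legal forms (illustrative MIR only):
///
///   %vcond:vcc(s1)   = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %scond:sgpr(s32) = G_ICMP intpred(eq), %x:sgpr(s32), %y:sgpr(s32)
///
/// i.e. the divergent boolean stays an s1 in the VCC bank, while the uniform
/// boolean is widened to a 32-bit SGPR value.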
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
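///
/// For example, in assembly terms (a sketch for pre-gfx10 targets):
///
///   v_add_f32_e64 v0, s0, s0   ; legal, one unique SGPR read
///   v_add_f32_e64 v0, s0, s1   ; illegal, two unique SGPRs on the constant bus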
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
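///
/// As a sketch, a VALU operation with an SGPR input such as
///
///   %sum:vgpr(s32) = G_ADD %a:sgpr(s32), %b:vgpr(s32)
///
/// is simply mapped by copying the SGPR operand:
///
///   %a2:vgpr(s32) = COPY %a:sgpr(s32)
///   %sum:vgpr(s32) = G_ADD %a2, %b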
68 ///
69 //===----------------------------------------------------------------------===//
70
71 #include "AMDGPURegisterBankInfo.h"
72
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91
92 using namespace llvm;
93 using namespace MIPatternMatch;
94
95 namespace {
96
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100 const AMDGPURegisterBankInfo &RBI;
101 MachineRegisterInfo &MRI;
102 const RegisterBank *NewBank;
103 SmallVector<MachineInstr *, 4> NewInsts;
104
105 public:
106   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107 MachineRegisterInfo &MRI_, const RegisterBank *RB)
108 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109
110   ~ApplyRegBankMapping() {
111 for (MachineInstr *MI : NewInsts)
112 applyBank(*MI);
113 }
114
115 /// Set any registers that don't have a set register class or bank to SALU.
116   void applyBank(MachineInstr &MI) {
117 const unsigned Opc = MI.getOpcode();
118 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119 Opc == AMDGPU::G_SEXT) {
120 // LegalizerHelper wants to use the basic legalization artifacts when
121 // widening etc. We don't handle selection with vcc in artifact sources,
122 // so we need to use a select instead to handle these properly.
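      // Illustrative rewrite (a sketch in MIR terms):
      //   %d:vgpr(s32) = G_ZEXT %c:vcc(s1)
      // becomes
      //   %one:vgpr(s32)  = G_CONSTANT i32 1
      //   %zero:vgpr(s32) = G_CONSTANT i32 0
      //   %d:vgpr(s32)    = G_SELECT %c:vcc(s1), %one, %zero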
123 Register DstReg = MI.getOperand(0).getReg();
124 Register SrcReg = MI.getOperand(1).getReg();
125 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126 if (SrcBank == &AMDGPU::VCCRegBank) {
127 const LLT S32 = LLT::scalar(32);
128 assert(MRI.getType(SrcReg) == LLT::scalar(1));
129 assert(MRI.getType(DstReg) == S32);
130 assert(NewBank == &AMDGPU::VGPRRegBank);
131
132 // Replace the extension with a select, which really uses the boolean
133 // source.
134 MachineIRBuilder B(MI);
135 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136 auto False = B.buildConstant(S32, 0);
137 B.buildSelect(DstReg, SrcReg, True, False);
138 MRI.setRegBank(True.getReg(0), *NewBank);
139 MRI.setRegBank(False.getReg(0), *NewBank);
140 MI.eraseFromParent();
141 }
142
143 assert(!MRI.getRegClassOrRegBank(DstReg));
144 MRI.setRegBank(DstReg, *NewBank);
145 return;
146 }
147
148 #ifndef NDEBUG
149 if (Opc == AMDGPU::G_TRUNC) {
150 Register DstReg = MI.getOperand(0).getReg();
151 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152 assert(DstBank != &AMDGPU::VCCRegBank);
153 }
154 #endif
155
156 for (MachineOperand &Op : MI.operands()) {
157 if (!Op.isReg())
158 continue;
159
160 // We may see physical registers if building a real MI
161 Register Reg = Op.getReg();
162 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163 continue;
164
165 const RegisterBank *RB = NewBank;
166 if (MRI.getType(Reg) == LLT::scalar(1)) {
167 assert(NewBank == &AMDGPU::VGPRRegBank &&
168 "s1 operands should only be used for vector bools");
169 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171 "not expecting legalization artifacts here");
172 RB = &AMDGPU::VCCRegBank;
173 }
174
175 MRI.setRegBank(Reg, *RB);
176 }
177 }
178
179   void erasingInstr(MachineInstr &MI) override {}
180
181   void createdInstr(MachineInstr &MI) override {
182 // At this point, the instruction was just inserted and has no operands.
183 NewInsts.push_back(&MI);
184 }
185
186   void changingInstr(MachineInstr &MI) override {}
187   void changedInstr(MachineInstr &MI) override {
188 // FIXME: In principle we should probably add the instruction to NewInsts,
189 // but the way the LegalizerHelper uses the observer, we will always see the
190 // registers we need to set the regbank on also referenced in a new
191 // instruction.
192 }
193 };
194
195 } // anonymous namespace
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198 TII(Subtarget.getInstrInfo()) {
199
200 // HACK: Until this is fully tablegen'd.
201 static llvm::once_flag InitializeRegisterBankFlag;
202
203 static auto InitializeRegisterBankOnce = [this]() {
204 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207 (void)this;
208 };
209
210 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214 unsigned BankID = Bank.getID();
215 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217
218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219 const RegisterBank &Src,
220 unsigned Size) const {
221 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224 return std::numeric_limits<unsigned>::max();
225 }
226
227 // Bool values are tricky, because the meaning is based on context. The SCC
228 // and VCC banks are for the natural scalar and vector conditions produced by
229 // a compare.
230 //
231 // Legalization doesn't know about the necessary context, so an s1 use may
232 // have been a truncate from an arbitrary value, in which case a copy (lowered
233 // as a compare with 0) needs to be inserted.
234 if (Size == 1 &&
235 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236 (isVectorRegisterBank(Src) ||
237 Src.getID() == AMDGPU::SGPRRegBankID ||
238 Src.getID() == AMDGPU::VCCRegBankID))
239 return std::numeric_limits<unsigned>::max();
240
241 // There is no direct copy between AGPRs.
242 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243 Src.getID() == AMDGPU::AGPRRegBankID)
244 return 4;
245
246 return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248
249 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250 const ValueMapping &ValMapping,
251 const RegisterBank *CurBank) const {
252 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253 // VGPR.
254 // FIXME: Is there a better way to do this?
255 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256 return 10; // This is expensive.
257
258 assert(ValMapping.NumBreakDowns == 2 &&
259 ValMapping.BreakDown[0].Length == 32 &&
260 ValMapping.BreakDown[0].StartIdx == 0 &&
261 ValMapping.BreakDown[1].Length == 32 &&
262 ValMapping.BreakDown[1].StartIdx == 32 &&
263 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264
265 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267 // want.
268
269 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270 // alignment restrictions, but this probably isn't important.
271 return 1;
272 }
273
274 const RegisterBank &
275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276 LLT Ty) const {
277 if (&RC == &AMDGPU::SReg_1RegClass)
278 return AMDGPU::VCCRegBank;
279
280 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281 // VCC-like use.
282 if (TRI->isSGPRClass(&RC)) {
283 // FIXME: This probably came from a copy from a physical register, which
284 // should be inferable from the copied to-type. We don't have many boolean
285 // physical register constraints so just assume a normal SGPR for now.
286 if (!Ty.isValid())
287 return AMDGPU::SGPRRegBank;
288
289 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290 }
291
292 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
297 AMDGPURegisterBankInfo::addMappingFromTable(
298 const MachineInstr &MI, const MachineRegisterInfo &MRI,
299 const std::array<unsigned, NumOps> RegSrcOpIdx,
300 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301
302 InstructionMappings AltMappings;
303
304 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305
306 unsigned Sizes[NumOps];
307 for (unsigned I = 0; I < NumOps; ++I) {
308 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310 }
311
312 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315 }
316
317 // getInstrMapping's default mapping uses ID 1, so start at 2.
318 unsigned MappingID = 2;
319 for (const auto &Entry : Table) {
320 for (unsigned I = 0; I < NumOps; ++I) {
321 int OpIdx = RegSrcOpIdx[I];
322 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323 }
324
325 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326 getOperandsMapping(Operands),
327 Operands.size()));
328 }
329
330 return AltMappings;
331 }
332
333 RegisterBankInfo::InstructionMappings
334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336 switch (MI.getIntrinsicID()) {
337 case Intrinsic::amdgcn_readlane: {
338 static const OpRegBankEntry<3> Table[2] = {
339 // Perfectly legal.
340 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341
342 // Need a readfirstlane for the index.
343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344 };
345
346 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
348 }
349 case Intrinsic::amdgcn_writelane: {
350 static const OpRegBankEntry<4> Table[4] = {
351 // Perfectly legal.
352 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353
354 // Need readfirstlane of first op
355 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356
357 // Need readfirstlane of second op
358 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359
360 // Need readfirstlane of both ops
361 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362 };
363
364     // dst, value, lane select, original value (vdst_in)
365 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
367 }
368 default:
369 return RegisterBankInfo::getInstrAlternativeMappings(MI);
370 }
371 }
372
373 RegisterBankInfo::InstructionMappings
374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376
377 switch (MI.getIntrinsicID()) {
378 case Intrinsic::amdgcn_s_buffer_load: {
379 static const OpRegBankEntry<2> Table[4] = {
380 // Perfectly legal.
381 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382
383 // Only need 1 register in loop
384 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385
386 // Have to waterfall the resource.
387 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388
389 // Have to waterfall the resource, and the offset.
390 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391 };
392
393 // rsrc, offset
394 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
396 }
397 case Intrinsic::amdgcn_ds_ordered_add:
398 case Intrinsic::amdgcn_ds_ordered_swap: {
399 // VGPR = M0, VGPR
400 static const OpRegBankEntry<3> Table[2] = {
401 // Perfectly legal.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
403
404 // Need a readfirstlane for m0
405 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406 };
407
408 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
410 }
411 case Intrinsic::amdgcn_s_sendmsg:
412 case Intrinsic::amdgcn_s_sendmsghalt: {
413 // FIXME: Should have no register for immediate
414 static const OpRegBankEntry<1> Table[2] = {
415 // Perfectly legal.
416 { { AMDGPU::SGPRRegBankID }, 1 },
417
418 // Need readlane
419 { { AMDGPU::VGPRRegBankID }, 3 }
420 };
421
422 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
424 }
425 default:
426 return RegisterBankInfo::getInstrAlternativeMappings(MI);
427 }
428 }
429
430 // FIXME: Returns uniform if there's no source value information. This is
431 // probably wrong.
432 static bool isScalarLoadLegal(const MachineInstr &MI) {
433 if (!MI.hasOneMemOperand())
434 return false;
435
436 const MachineMemOperand *MMO = *MI.memoperands_begin();
437 const unsigned AS = MMO->getAddrSpace();
438 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
439 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
440 // Require 4-byte alignment.
441 return MMO->getAlign() >= Align(4) &&
442 // Can't do a scalar atomic load.
443 !MMO->isAtomic() &&
444 // Don't use scalar loads for volatile accesses to non-constant address
445 // spaces.
446 (IsConst || !MMO->isVolatile()) &&
447 // Memory must be known constant, or not written before this load.
448 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
449 AMDGPUInstrInfo::isUniformMMO(MMO);
450 }
451
452 RegisterBankInfo::InstructionMappings
453 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
454 const MachineInstr &MI) const {
455
456 const MachineFunction &MF = *MI.getParent()->getParent();
457 const MachineRegisterInfo &MRI = MF.getRegInfo();
458
459
460 InstructionMappings AltMappings;
461 switch (MI.getOpcode()) {
462 case TargetOpcode::G_CONSTANT: {
463 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
464 if (Size == 1) {
465 static const OpRegBankEntry<1> Table[3] = {
466 { { AMDGPU::VGPRRegBankID }, 1 },
467 { { AMDGPU::SGPRRegBankID }, 1 },
468 { { AMDGPU::VCCRegBankID }, 1 }
469 };
470
471 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
472 }
473
474 LLVM_FALLTHROUGH;
475 }
476 case TargetOpcode::G_FCONSTANT:
477 case TargetOpcode::G_FRAME_INDEX:
478 case TargetOpcode::G_GLOBAL_VALUE: {
479 static const OpRegBankEntry<1> Table[2] = {
480 { { AMDGPU::VGPRRegBankID }, 1 },
481 { { AMDGPU::SGPRRegBankID }, 1 }
482 };
483
484 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
485 }
486 case TargetOpcode::G_AND:
487 case TargetOpcode::G_OR:
488 case TargetOpcode::G_XOR: {
489 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
490
491 if (Size == 1) {
492 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
493 const InstructionMapping &SCCMapping = getInstructionMapping(
494 1, 1, getOperandsMapping(
495 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
496 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
498 3); // Num Operands
499 AltMappings.push_back(&SCCMapping);
500
501 const InstructionMapping &VCCMapping0 = getInstructionMapping(
502 2, 1, getOperandsMapping(
503 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
504 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
506 3); // Num Operands
507 AltMappings.push_back(&VCCMapping0);
508 return AltMappings;
509 }
510
511 if (Size != 64)
512 break;
513
514 const InstructionMapping &SSMapping = getInstructionMapping(
515 1, 1, getOperandsMapping(
516 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
517 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
519 3); // Num Operands
520 AltMappings.push_back(&SSMapping);
521
522 const InstructionMapping &VVMapping = getInstructionMapping(
523 2, 2, getOperandsMapping(
524 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
525 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
527 3); // Num Operands
528 AltMappings.push_back(&VVMapping);
529 break;
530 }
531 case TargetOpcode::G_LOAD:
532 case TargetOpcode::G_ZEXTLOAD:
533 case TargetOpcode::G_SEXTLOAD: {
534 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
535 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
536 unsigned PtrSize = PtrTy.getSizeInBits();
537 unsigned AS = PtrTy.getAddressSpace();
538
539 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
540 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
541 isScalarLoadLegal(MI)) {
542 const InstructionMapping &SSMapping = getInstructionMapping(
543 1, 1, getOperandsMapping(
544 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
545 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
546 2); // Num Operands
547 AltMappings.push_back(&SSMapping);
548 }
549
550 const InstructionMapping &VVMapping = getInstructionMapping(
551 2, 1,
552 getOperandsMapping(
553 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
554 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
555 2); // Num Operands
556 AltMappings.push_back(&VVMapping);
557
558 // It may be possible to have a vgpr = load sgpr mapping here, because
559 // the mubuf instructions support this kind of load, but probably for only
560 // gfx7 and older. However, the addressing mode matching in the instruction
561 // selector should be able to do a better job of detecting and selecting
562 // these kinds of loads from the vgpr = load vgpr mapping.
563
564 return AltMappings;
565
566 }
567 case TargetOpcode::G_SELECT: {
568 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
569 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
570 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
571 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
574 4); // Num Operands
575 AltMappings.push_back(&SSMapping);
576
577 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
578 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
579 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
580 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
581 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
582 4); // Num Operands
583 AltMappings.push_back(&VVMapping);
584
585 return AltMappings;
586 }
587 case TargetOpcode::G_UADDE:
588 case TargetOpcode::G_USUBE:
589 case TargetOpcode::G_SADDE:
590 case TargetOpcode::G_SSUBE: {
591 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
592 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
593 getOperandsMapping(
594 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
595 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
596 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
599 5); // Num Operands
600 AltMappings.push_back(&SSMapping);
601
602 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
603 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
604 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
605 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
606 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
608 5); // Num Operands
609 AltMappings.push_back(&VVMapping);
610 return AltMappings;
611 }
612 case AMDGPU::G_BRCOND: {
613 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
614
615 // TODO: Change type to 32 for scalar
616 const InstructionMapping &SMapping = getInstructionMapping(
617 1, 1, getOperandsMapping(
618 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
619 2); // Num Operands
620 AltMappings.push_back(&SMapping);
621
622 const InstructionMapping &VMapping = getInstructionMapping(
623 1, 1, getOperandsMapping(
624 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
625 2); // Num Operands
626 AltMappings.push_back(&VMapping);
627 return AltMappings;
628 }
629 case AMDGPU::G_INTRINSIC:
630 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
631 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
632 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
633 default:
634 break;
635 }
636 return RegisterBankInfo::getInstrAlternativeMappings(MI);
637 }
638
639 void AMDGPURegisterBankInfo::split64BitValueForMapping(
640 MachineIRBuilder &B,
641 SmallVector<Register, 2> &Regs,
642 LLT HalfTy,
643 Register Reg) const {
644 assert(HalfTy.getSizeInBits() == 32);
645 MachineRegisterInfo *MRI = B.getMRI();
646 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
647 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
648 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
649 MRI->setRegBank(LoLHS, *Bank);
650 MRI->setRegBank(HiLHS, *Bank);
651
652 Regs.push_back(LoLHS);
653 Regs.push_back(HiLHS);
654
655 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
656 .addDef(LoLHS)
657 .addDef(HiLHS)
658 .addUse(Reg);
659 }
660
661 /// Replace the current type each register in \p Regs has with \p NewTy
662 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
663 LLT NewTy) {
664 for (Register Reg : Regs) {
665 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
666 MRI.setType(Reg, NewTy);
667 }
668 }
669
670 static LLT getHalfSizedType(LLT Ty) {
671 if (Ty.isVector()) {
672 assert(Ty.getElementCount().isKnownMultipleOf(2));
673 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
674 Ty.getElementType());
675 }
676
677 assert(Ty.getScalarSizeInBits() % 2 == 0);
678 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
679 }
680
681 // Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
682 // source value into a scalar register.
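// For example (a sketch), a 64-bit VGPR source is handled roughly as:
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sreg_32 = V_READFIRSTLANE_B32 %lo
//   %shi:sreg_32 = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi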
683 Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
684 MachineRegisterInfo &MRI,
685 Register Src) const {
686 LLT Ty = MRI.getType(Src);
687 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
688
689 if (Bank == &AMDGPU::SGPRRegBank)
690 return Src;
691
692 unsigned Bits = Ty.getSizeInBits();
693 assert(Bits % 32 == 0);
694
695 if (Bank != &AMDGPU::VGPRRegBank) {
696 // We need to copy from AGPR to VGPR
697 Src = B.buildCopy(Ty, Src).getReg(0);
698 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
699 }
700
701 LLT S32 = LLT::scalar(32);
702 unsigned NumParts = Bits / 32;
703 SmallVector<Register, 8> SrcParts;
704 SmallVector<Register, 8> DstParts;
705
706 if (Bits == 32) {
707 SrcParts.push_back(Src);
708 } else {
709 auto Unmerge = B.buildUnmerge(S32, Src);
710 for (unsigned i = 0; i < NumParts; ++i)
711 SrcParts.push_back(Unmerge.getReg(i));
712 }
713
714 for (unsigned i = 0; i < NumParts; ++i) {
715 Register SrcPart = SrcParts[i];
716 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
717 MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
718
719 const TargetRegisterClass *Constrained =
720 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
721 (void)Constrained;
722 assert(Constrained && "Failed to constrain readfirstlane src reg");
723
724 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
725
726 DstParts.push_back(DstPart);
727 }
728
729 if (Bits == 32)
730 return DstParts[0];
731
732 Register Dst = B.buildMerge(Ty, DstParts).getReg(0);
733 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
734 return Dst;
735 }
736
737 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
738 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
739 /// execute the instruction for each unique combination of values in all lanes
740 /// in the wave. The block will be split such that the rest of the instructions are
741 /// moved to a new block.
742 ///
743 /// Essentially performs this loop:
744 ///
745 /// Save Execution Mask
746 /// For (Lane : Wavefront) {
747 /// Enable Lane, Disable all other lanes
748 /// SGPR = read SGPR value for current lane from VGPR
749 /// VGPRResult[Lane] = use_op SGPR
750 /// }
751 /// Restore Execution Mask
752 ///
753 /// There is additional complexity in the compares: they identify the lanes
754 /// that share the same value, so each unique combination is only executed once.
755 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
756 MachineIRBuilder &B,
757 iterator_range<MachineBasicBlock::iterator> Range,
758 SmallSet<Register, 4> &SGPROperandRegs,
759 MachineRegisterInfo &MRI) const {
760
761 // Track use registers which have already been expanded with a readfirstlane
762 // sequence. This may have multiple uses if moving a sequence.
763 DenseMap<Register, Register> WaterfalledRegMap;
764
765 MachineBasicBlock &MBB = B.getMBB();
766 MachineFunction *MF = &B.getMF();
767
768 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
769 const unsigned MovExecOpc =
770 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
771 const unsigned MovExecTermOpc =
772 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
773
774 const unsigned XorTermOpc = Subtarget.isWave32() ?
775 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
776 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
777 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
778 const unsigned ExecReg = Subtarget.isWave32() ?
779 AMDGPU::EXEC_LO : AMDGPU::EXEC;
780
781 #ifndef NDEBUG
782 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
783 #endif
784
785 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
786 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
787
788 // Don't bother using generic instructions/registers for the exec mask.
789 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
790 .addDef(InitSaveExecReg);
791
792 Register PhiExec = MRI.createVirtualRegister(WaveRC);
793 Register NewExec = MRI.createVirtualRegister(WaveRC);
794
795 // To insert the loop we need to split the block. Move everything before this
796 // point to a new block, and insert a new empty block before this instruction.
797 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
798 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
799 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
800 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
801 MachineFunction::iterator MBBI(MBB);
802 ++MBBI;
803 MF->insert(MBBI, LoopBB);
804 MF->insert(MBBI, BodyBB);
805 MF->insert(MBBI, RestoreExecBB);
806 MF->insert(MBBI, RemainderBB);
807
808 LoopBB->addSuccessor(BodyBB);
809 BodyBB->addSuccessor(RestoreExecBB);
810 BodyBB->addSuccessor(LoopBB);
811
812 // Move the rest of the block into a new block.
813 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
814 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
815
816 MBB.addSuccessor(LoopBB);
817 RestoreExecBB->addSuccessor(RemainderBB);
818
819 B.setInsertPt(*LoopBB, LoopBB->end());
820
821 B.buildInstr(TargetOpcode::PHI)
822 .addDef(PhiExec)
823 .addReg(InitSaveExecReg)
824 .addMBB(&MBB)
825 .addReg(NewExec)
826 .addMBB(BodyBB);
827
828 const DebugLoc &DL = B.getDL();
829
830 MachineInstr &FirstInst = *Range.begin();
831
832 // Move the instruction into the loop body. Note we moved everything after
833 // Range.end() already into a new block, so Range.end() is no longer valid.
834 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
835
836 // Figure out the iterator range after splicing the instructions.
837 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
838 auto NewEnd = BodyBB->end();
839
840 B.setMBB(*LoopBB);
841
842 LLT S1 = LLT::scalar(1);
843 Register CondReg;
844
845 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
846
847 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
848 for (MachineOperand &Op : MI.uses()) {
849 if (!Op.isReg() || Op.isDef())
850 continue;
851
852 Register OldReg = Op.getReg();
853 if (!SGPROperandRegs.count(OldReg))
854 continue;
855
856 // See if we already processed this register in another instruction in the
857 // sequence.
858 auto OldVal = WaterfalledRegMap.find(OldReg);
859 if (OldVal != WaterfalledRegMap.end()) {
860 Op.setReg(OldVal->second);
861 continue;
862 }
863
864 Register OpReg = Op.getReg();
865 LLT OpTy = MRI.getType(OpReg);
866
867 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
868 if (OpBank != &AMDGPU::VGPRRegBank) {
869 // Insert copy from AGPR to VGPR before the loop.
870 B.setMBB(MBB);
871 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
872 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
873 B.setMBB(*LoopBB);
874 }
875
876 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
877
878 // Build the comparison(s).
879 unsigned OpSize = OpTy.getSizeInBits();
880 bool Is64 = OpSize % 64 == 0;
881 unsigned PartSize = Is64 ? 64 : 32;
882 LLT PartTy = LLT::scalar(PartSize);
883 unsigned NumParts = OpSize / PartSize;
884 SmallVector<Register, 8> OpParts;
885 SmallVector<Register, 8> CurrentLaneParts;
886
887 if (NumParts == 1) {
888 OpParts.push_back(OpReg);
889 CurrentLaneParts.push_back(CurrentLaneReg);
890 } else {
891 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
892 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
893 for (unsigned i = 0; i < NumParts; ++i) {
894 OpParts.push_back(UnmergeOp.getReg(i));
895 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
896 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
897 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
898 }
899 }
900
901 for (unsigned i = 0; i < NumParts; ++i) {
902 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
903 OpParts[i]).getReg(0);
904 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
905
906 if (!CondReg) {
907 CondReg = CmpReg;
908 } else {
909 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
910 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
911 }
912 }
913
914 Op.setReg(CurrentLaneReg);
915
916 // Make sure we don't re-process this register again.
917 WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
918 }
919 }
920
921 // The ballot becomes a no-op during instruction selection.
922 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
923 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
924 false)
925 .addReg(CondReg)
926 .getReg(0);
927 MRI.setRegClass(CondReg, WaveRC);
928
929 // Update EXEC, save the original EXEC value to VCC.
930 B.buildInstr(AndSaveExecOpc)
931 .addDef(NewExec)
932 .addReg(CondReg, RegState::Kill);
933
934 MRI.setSimpleHint(NewExec, CondReg);
935
936 B.setInsertPt(*BodyBB, BodyBB->end());
937
938 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
939 B.buildInstr(XorTermOpc)
940 .addDef(ExecReg)
941 .addReg(ExecReg)
942 .addReg(NewExec);
943
944 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
945 // s_cbranch_scc0?
946
947 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
948 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
949
950 // Save the EXEC mask before the loop.
951 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
952 .addReg(ExecReg);
953
954 // Restore the EXEC mask after the loop.
955 B.setMBB(*RestoreExecBB);
956 B.buildInstr(MovExecTermOpc)
957 .addDef(ExecReg)
958 .addReg(SaveExecReg);
959
960 // Set the insert point after the original instruction, so any new
961 // instructions will be in the remainder.
962 B.setInsertPt(*RemainderBB, RemainderBB->begin());
963
964 return true;
965 }
966
967 // Return any unique registers used by \p MI at \p OpIndices that need to be
968 // handled in a waterfall loop. Returns these registers in \p
969 // SGPROperandRegs. Returns true if there are any operands to handle and a
970 // waterfall loop is necessary.
971 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
972 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
973 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
974 for (unsigned Op : OpIndices) {
975 assert(MI.getOperand(Op).isUse());
976 Register Reg = MI.getOperand(Op).getReg();
977 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
978 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
979 SGPROperandRegs.insert(Reg);
980 }
981
982 // No operands need to be replaced, so no need to loop.
983 return !SGPROperandRegs.empty();
984 }
985
986 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
987 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
988 ArrayRef<unsigned> OpIndices) const {
989 // Use a set to avoid extra readfirstlanes in the case where multiple operands
990 // are the same register.
991 SmallSet<Register, 4> SGPROperandRegs;
992
993 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
994 return false;
995
996 MachineBasicBlock::iterator I = MI.getIterator();
997 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
998 SGPROperandRegs, MRI);
999 }
1000
1001 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1002 MachineInstr &MI, MachineRegisterInfo &MRI,
1003 ArrayRef<unsigned> OpIndices) const {
1004 MachineIRBuilder B(MI);
1005 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1006 }
1007
1008 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1009 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1010 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1011 Register Reg = MI.getOperand(OpIdx).getReg();
1012 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1013 if (Bank == &AMDGPU::SGPRRegBank)
1014 return;
1015
1016 MachineIRBuilder B(MI);
1017
1018 Reg = buildReadFirstLane(B, MRI, Reg);
1019 MI.getOperand(OpIdx).setReg(Reg);
1020 }
1021
1022 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1023 /// rest will be in the remainder.
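// For example (a sketch of the intent): splitUnequalType(<3 x s32>, 64) yields
// {<2 x s32>, s32}, and splitUnequalType(s96, 64) yields {s64, s32}.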
1024 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1025 unsigned TotalSize = Ty.getSizeInBits();
1026 if (!Ty.isVector())
1027 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1028
1029 LLT EltTy = Ty.getElementType();
1030 unsigned EltSize = EltTy.getSizeInBits();
1031 assert(FirstSize % EltSize == 0);
1032
1033 unsigned FirstPartNumElts = FirstSize / EltSize;
1034 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1035
1036 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1037 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1038 }
1039
1040 static LLT widen96To128(LLT Ty) {
1041 if (!Ty.isVector())
1042 return LLT::scalar(128);
1043
1044 LLT EltTy = Ty.getElementType();
1045 assert(128 % EltTy.getSizeInBits() == 0);
1046 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1047 }
1048
1049 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1050 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1051 MachineRegisterInfo &MRI) const {
1052 Register DstReg = MI.getOperand(0).getReg();
1053 const LLT LoadTy = MRI.getType(DstReg);
1054 unsigned LoadSize = LoadTy.getSizeInBits();
1055 const unsigned MaxNonSmrdLoadSize = 128;
1056
1057 const RegisterBank *DstBank =
1058 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1059 if (DstBank == &AMDGPU::SGPRRegBank) {
1060     // There are some special cases that we need to look at for 32-bit and
1061     // 96-bit SGPR loads; otherwise we have nothing to do.
1062 if (LoadSize != 32 && LoadSize != 96)
1063 return false;
1064
1065 MachineMemOperand *MMO = *MI.memoperands_begin();
1066 const unsigned MemSize = 8 * MMO->getSize();
1067     // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1068     // 32 bit. Check to see if we need to widen the memory access; 8- or 16-bit
1069     // scalar loads should have a load size of 32 but a memory access size of
1070     // less than 32.
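    // For example (a sketch): a uniform 8-bit G_ZEXTLOAD producing s32 becomes
    // a 32-bit scalar load followed by clearing the high bits via
    // buildZExtInReg(..., 8) below.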
1071 if (LoadSize == 32 &&
1072 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1073 return false;
1074
1075 Register PtrReg = MI.getOperand(1).getReg();
1076
1077 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1078 MachineIRBuilder B(MI, O);
1079
1080 if (LoadSize == 32) {
1081 // This is an extending load from a sub-dword size. Widen the memory
1082 // access size to 4 bytes and clear the extra high bits appropriately
1083 const LLT S32 = LLT::scalar(32);
1084 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1085 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1086 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1087 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1088 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1089 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1090 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1091 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1092 } else
1093 // We do not need to touch the higher bits for regular loads.
1094 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1095 } else {
1096 // 96-bit loads are only available for vector loads. We need to split this
1097       // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1098 if (MMO->getAlign() < Align(16)) {
1099 MachineFunction *MF = MI.getParent()->getParent();
1100 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1101 MachineIRBuilder B(MI, ApplyBank);
1102 LegalizerHelper Helper(*MF, ApplyBank, B);
1103 LLT Part64, Part32;
1104 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1105 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1106 LegalizerHelper::Legalized)
1107 return false;
1108 return true;
1109 } else {
1110 LLT WiderTy = widen96To128(LoadTy);
1111 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1112 if (WiderTy.isScalar())
1113 B.buildTrunc(MI.getOperand(0), WideLoad);
1114 else {
1115 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1116 WideLoad);
1117 }
1118 }
1119 }
1120
1121 MI.eraseFromParent();
1122 return true;
1123 }
1124
1125 // 128-bit loads are supported for all instruction types.
1126 if (LoadSize <= MaxNonSmrdLoadSize)
1127 return false;
1128
1129 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1130 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1131
1132 if (SrcRegs.empty())
1133 SrcRegs.push_back(MI.getOperand(1).getReg());
1134
1135 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1136
1137 // RegBankSelect only emits scalar types, so we need to reset the pointer
1138 // operand to a pointer type.
1139 Register BasePtrReg = SrcRegs[0];
1140 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1141 MRI.setType(BasePtrReg, PtrTy);
1142
1143 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1144 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1145 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1146 MachineIRBuilder B(MI, Observer);
1147 LegalizerHelper Helper(B.getMF(), Observer, B);
1148
1149 if (LoadTy.isVector()) {
1150 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1151 return false;
1152 } else {
1153 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1154 return false;
1155 }
1156
1157 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1158 return true;
1159 }
1160
1161 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1162 MachineInstr &MI,
1163 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1164 MachineRegisterInfo &MRI) const {
1165 const MachineFunction &MF = *MI.getMF();
1166 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1167 const auto &TFI = *ST.getFrameLowering();
1168
1169 // Guard in case the stack growth direction ever changes with scratch
1170 // instructions.
1171 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1172 return false;
1173
1174 Register Dst = MI.getOperand(0).getReg();
1175 Register AllocSize = MI.getOperand(1).getReg();
1176 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1177
1178 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1179
1180 // TODO: Need to emit a wave reduction to get the maximum size.
1181 if (SizeBank != &AMDGPU::SGPRRegBank)
1182 return false;
1183
1184 LLT PtrTy = MRI.getType(Dst);
1185 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1186
1187 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1188 Register SPReg = Info->getStackPtrOffsetReg();
1189 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1190 MachineIRBuilder B(MI, ApplyBank);
1191
1192 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1193 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1194
1195 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1196 if (Alignment > TFI.getStackAlign()) {
1197 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1198 B.buildMaskLowPtrBits(Dst, PtrAdd,
1199 Log2(Alignment) + ST.getWavefrontSizeLog2());
1200 } else {
1201 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1202 }
1203
1204 MI.eraseFromParent();
1205 return true;
1206 }
1207
1208 bool AMDGPURegisterBankInfo::applyMappingImage(
1209 MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1210 MachineRegisterInfo &MRI, int RsrcIdx) const {
1211 const int NumDefs = MI.getNumExplicitDefs();
1212
1213 // The reported argument index is relative to the IR intrinsic call arguments,
1214 // so we need to shift by the number of defs and the intrinsic ID.
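  // For example, with a single def the rsrc IR argument at index i ends up at
  // machine operand i + 2 (one def plus the intrinsic ID operand).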
1215 RsrcIdx += NumDefs + 1;
1216
1217 // Insert copies to VGPR arguments.
1218 applyDefaultMapping(OpdMapper);
1219
1220 // Fixup any SGPR arguments.
1221 SmallVector<unsigned, 4> SGPRIndexes;
1222 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1223 if (!MI.getOperand(I).isReg())
1224 continue;
1225
1226 // If this intrinsic has a sampler, it immediately follows rsrc.
1227 if (I == RsrcIdx || I == RsrcIdx + 1)
1228 SGPRIndexes.push_back(I);
1229 }
1230
1231 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1232 return true;
1233 }
1234
1235 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1236 Register Reg) {
1237 MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1238 if (!Def)
1239 return Reg;
1240
1241 // TODO: Guard against this being an implicit def
1242 return Def->getOperand(0).getReg();
1243 }
1244
1245 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1246 // the three offsets (voffset, soffset and instoffset)
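// For example (a sketch): given a uniform combined offset
//   %off:sgpr(s32) = G_ADD %base:sgpr(s32), %c:sgpr(s32)  ; %c = constant 1024
// this can come out as soffset = %base, instoffset = 1024 and voffset = 0,
// assuming the constant fits the MUBUF immediate offset field.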
1247 static unsigned setBufferOffsets(MachineIRBuilder &B,
1248 const AMDGPURegisterBankInfo &RBI,
1249 Register CombinedOffset, Register &VOffsetReg,
1250 Register &SOffsetReg, int64_t &InstOffsetVal,
1251 Align Alignment) {
1252 const LLT S32 = LLT::scalar(32);
1253 MachineRegisterInfo *MRI = B.getMRI();
1254
1255 if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1256 uint32_t SOffset, ImmOffset;
1257 if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1258 Alignment)) {
1259 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1260 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1261 InstOffsetVal = ImmOffset;
1262
1263 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1264 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1265 return SOffset + ImmOffset;
1266 }
1267 }
1268
1269 Register Base;
1270 unsigned Offset;
1271
1272 std::tie(Base, Offset) =
1273 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1274
1275 uint32_t SOffset, ImmOffset;
1276 if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1277 &RBI.Subtarget, Alignment)) {
1278 if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1279 VOffsetReg = Base;
1280 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1281 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1282 InstOffsetVal = ImmOffset;
1283 return 0; // XXX - Why is this 0?
1284 }
1285
1286 // If we have SGPR base, we can use it for soffset.
1287 if (SOffset == 0) {
1288 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1289 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1290 SOffsetReg = Base;
1291 InstOffsetVal = ImmOffset;
1292 return 0; // XXX - Why is this 0?
1293 }
1294 }
1295
1296 // Handle the variable sgpr + vgpr case.
1297 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1298 if (Add && (int)Offset >= 0) {
1299 Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1300 Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1301
1302 const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1303 const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1304
1305 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1306 VOffsetReg = Src0;
1307 SOffsetReg = Src1;
1308 return 0;
1309 }
1310
1311 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1312 VOffsetReg = Src1;
1313 SOffsetReg = Src0;
1314 return 0;
1315 }
1316 }
1317
1318 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1319 // have an SGPR offset and a VGPR resource.
1320 if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1321 VOffsetReg = CombinedOffset;
1322 } else {
1323 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1324 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1325 }
1326
1327 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1328 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1329 return 0;
1330 }
1331
1332 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1333 const OperandsMapper &OpdMapper) const {
1334 MachineInstr &MI = OpdMapper.getMI();
1335 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1336
1337 const LLT S32 = LLT::scalar(32);
1338 Register Dst = MI.getOperand(0).getReg();
1339 LLT Ty = MRI.getType(Dst);
1340
1341 const RegisterBank *RSrcBank =
1342 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1343 const RegisterBank *OffsetBank =
1344 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1345 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1346 OffsetBank == &AMDGPU::SGPRRegBank)
1347 return true; // Legal mapping
1348
1349 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1350 // here but don't have an MMO.
1351
1352 unsigned LoadSize = Ty.getSizeInBits();
1353 int NumLoads = 1;
1354 if (LoadSize == 256 || LoadSize == 512) {
1355 NumLoads = LoadSize / 128;
1356 Ty = Ty.divide(NumLoads);
1357 }
1358
1359 // Use the alignment to ensure that the required offsets will fit into the
1360 // immediate offsets.
1361 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1362
1363 MachineIRBuilder B(MI);
1364 MachineFunction &MF = B.getMF();
1365
1366 Register SOffset;
1367 Register VOffset;
1368 int64_t ImmOffset = 0;
1369
1370 unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1371 VOffset, SOffset, ImmOffset, Alignment);
1372
1373 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1374 // can, but we need to track an MMO for that.
1375 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1376 const Align MemAlign(4); // FIXME: ABI type alignment?
1377 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1378 MachinePointerInfo(),
1379 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1380 MachineMemOperand::MOInvariant,
1381 MemSize, MemAlign);
1382 if (MMOOffset != 0)
1383 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1384
1385 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1386 // assume that the buffer is unswizzled.
1387
1388 Register RSrc = MI.getOperand(1).getReg();
1389 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1390 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1391
1392 SmallVector<Register, 4> LoadParts(NumLoads);
1393
1394 MachineBasicBlock::iterator MII = MI.getIterator();
1395 MachineInstrSpan Span(MII, &B.getMBB());
1396
1397 for (int i = 0; i < NumLoads; ++i) {
1398 if (NumLoads == 1) {
1399 LoadParts[i] = Dst;
1400 } else {
1401 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1402 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1403 }
1404
1405 MachineMemOperand *MMO = BaseMMO;
1406 if (i != 0)
1407 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1408
1409 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1410 .addDef(LoadParts[i]) // vdata
1411 .addUse(RSrc) // rsrc
1412 .addUse(VIndex) // vindex
1413 .addUse(VOffset) // voffset
1414 .addUse(SOffset) // soffset
1415 .addImm(ImmOffset + 16 * i) // offset(imm)
1416 .addImm(0) // cachepolicy, swizzled buffer(imm)
1417 .addImm(0) // idxen(imm)
1418 .addMemOperand(MMO);
1419 }
1420
1421 // TODO: If only the resource is a VGPR, it may be better to execute the
1422 // scalar load in the waterfall loop if the resource is expected to frequently
1423 // be dynamically uniform.
1424 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1425 // Remove the original instruction to avoid potentially confusing the
1426 // waterfall loop logic.
1427 B.setInstr(*Span.begin());
1428 MI.eraseFromParent();
1429
1430 SmallSet<Register, 4> OpsToWaterfall;
1431
1432 OpsToWaterfall.insert(RSrc);
1433 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1434 OpsToWaterfall, MRI);
1435 }
1436
1437 if (NumLoads != 1) {
1438 if (Ty.isVector())
1439 B.buildConcatVectors(Dst, LoadParts);
1440 else
1441 B.buildMerge(Dst, LoadParts);
1442 }
1443
1444 // We removed the instruction earlier with a waterfall loop.
1445 if (RSrcBank == &AMDGPU::SGPRRegBank)
1446 MI.eraseFromParent();
1447
1448 return true;
1449 }
1450
bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1452 bool Signed) const {
1453 MachineInstr &MI = OpdMapper.getMI();
1454 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1455
1456 // Insert basic copies
1457 applyDefaultMapping(OpdMapper);
1458
1459 Register DstReg = MI.getOperand(0).getReg();
1460 LLT Ty = MRI.getType(DstReg);
1461
1462 const LLT S32 = LLT::scalar(32);
1463
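  // When called for the intrinsic form, operand 1 is the intrinsic ID, so the
  // source/offset/width operands start at index 2 instead of 1.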
1464 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1465 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1466 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1467 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1468
1469 const RegisterBank *DstBank =
1470 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1471 if (DstBank == &AMDGPU::VGPRRegBank) {
1472 if (Ty == S32)
1473 return true;
1474
    // There is no 64-bit VGPR bitfield extract instruction, so the operation
    // is expanded into an equivalent sequence of instructions.
1477 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1478 MachineIRBuilder B(MI, ApplyBank);
1479
1480 const LLT S64 = LLT::scalar(64);
1481 // Shift the source operand so that extracted bits start at bit 0.
1482 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1483 : B.buildLShr(S64, SrcReg, OffsetReg);
1484 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1485
1486 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1487 // if the width is a constant.
1488 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
      // Depending on the width, operate on either the low or the high 32 bits.
1491 auto Zero = B.buildConstant(S32, 0);
1492 auto WidthImm = ConstWidth->Value.getZExtValue();
1493 if (WidthImm <= 32) {
1494 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1495 // or clear the upper 32-bits.
1496 auto Extract =
1497 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1498 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1499 auto Extend =
1500 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1501 B.buildMerge(DstReg, {Extract, Extend});
1502 } else {
1503 // Use bitfield extract on upper 32-bit source, and combine with lower
1504 // 32-bit source.
1505 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1506 auto Extract =
1507 Signed
1508 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1509 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1510 B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1511 }
1512 MI.eraseFromParent();
1513 return true;
1514 }
1515
1516 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1517 // operations.
1518 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1519 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1520 if (Signed)
1521 B.buildAShr(S64, SignBit, ExtShift);
1522 else
1523 B.buildLShr(S64, SignBit, ExtShift);
1524 MI.eraseFromParent();
1525 return true;
1526 }
1527
1528 // The scalar form packs the offset and width in a single operand.
1529
1530 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1531 MachineIRBuilder B(MI, ApplyBank);
1532
1533 // Ensure the high bits are clear to insert the offset.
1534 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1535 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1536
  // The shift zeros out the low bits, so there is no need to clamp the width
  // value.
1538 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1539
  // Pack the offset and width of the BFE into the format expected by
  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] hold the
  // offset and bits [22:16] hold the width.
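  // For example, offset = 8 and width = 16 pack to (16 << 16) | 8 = 0x100008.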
1543 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1544
1545 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1546 // register class constraints.
1547 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1548 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1549
1550 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1551 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1552 llvm_unreachable("failed to constrain BFE");
1553
1554 MI.eraseFromParent();
1555 return true;
1556 }
1557
bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1559 const OperandsMapper &OpdMapper) const {
1560 MachineInstr &MI = OpdMapper.getMI();
1561 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1562
1563 // Insert basic copies.
1564 applyDefaultMapping(OpdMapper);
1565
1566 Register Dst0 = MI.getOperand(0).getReg();
1567 Register Dst1 = MI.getOperand(1).getReg();
1568 Register Src0 = MI.getOperand(2).getReg();
1569 Register Src1 = MI.getOperand(3).getReg();
1570 Register Src2 = MI.getOperand(4).getReg();
1571
1572 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1573 return true;
1574
1575 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1576 LLT S1 = LLT::scalar(1);
1577 LLT S32 = LLT::scalar(32);
1578
1579 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1580 bool Accumulate = true;
1581
1582 if (!DstOnValu) {
1583 if (mi_match(Src2, MRI, m_ZeroInt()))
1584 Accumulate = false;
1585 }
1586
1587 // Keep the multiplication on the SALU.
1588 MachineIRBuilder B(MI);
1589
1590 Register DstHi;
1591 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1592 bool MulHiInVgpr = false;
1593
1594 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1595
1596 if (Subtarget.hasSMulHi()) {
1597 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1598 : B.buildSMulH(S32, Src0, Src1).getReg(0);
1599 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1600 } else {
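    // No scalar mul-hi is available, so compute the high half on the VALU and,
    // if the result needs to stay scalar, read it back with readfirstlane.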
1601 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1602 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1603
1604 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1605 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1606
1607 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1608 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1609 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1610
1611 if (!DstOnValu) {
1612 DstHi = buildReadFirstLane(B, MRI, DstHi);
1613 } else {
1614 MulHiInVgpr = true;
1615 }
1616 }
1617
1618 // Accumulate and produce the "carry-out" bit.
1619 //
1620 // The "carry-out" is defined as bit 64 of the result when computed as a
1621 // big integer. For unsigned multiply-add, this matches the usual definition
1622 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1623 // result, which is determined as:
1624 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1625 LLT CarryType = DstOnValu ? S1 : S32;
1626 const RegisterBank &CarryBank =
1627 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1628 const RegisterBank &DstBank =
1629 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1630 Register Carry;
1631 Register Zero;
1632
1633 if (!IsUnsigned) {
1634 Zero = B.buildConstant(S32, 0).getReg(0);
1635 MRI.setRegBank(Zero,
1636 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1637
1638 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1639 .getReg(0);
1640 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1641 : AMDGPU::SGPRRegBank);
1642
1643 if (DstOnValu && !MulHiInVgpr) {
1644 Carry = B.buildTrunc(S1, Carry).getReg(0);
1645 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1646 }
1647 }
1648
1649 if (Accumulate) {
1650 if (DstOnValu) {
1651 DstLo = B.buildCopy(S32, DstLo).getReg(0);
1652 DstHi = B.buildCopy(S32, DstHi).getReg(0);
1653 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1654 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1655 }
1656
1657 auto Unmerge = B.buildUnmerge(S32, Src2);
1658 Register Src2Lo = Unmerge.getReg(0);
1659 Register Src2Hi = Unmerge.getReg(1);
1660 MRI.setRegBank(Src2Lo, DstBank);
1661 MRI.setRegBank(Src2Hi, DstBank);
1662
1663 if (!IsUnsigned) {
1664 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1665 MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1666
1667 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1668 MRI.setRegBank(Carry, CarryBank);
1669 }
1670
1671 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1672 DstLo = AddLo.getReg(0);
1673 Register CarryLo = AddLo.getReg(1);
1674 MRI.setRegBank(DstLo, DstBank);
1675 MRI.setRegBank(CarryLo, CarryBank);
1676
1677 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1678 DstHi = AddHi.getReg(0);
1679 MRI.setRegBank(DstHi, DstBank);
1680
1681 Register CarryHi = AddHi.getReg(1);
1682 MRI.setRegBank(CarryHi, CarryBank);
1683
1684 if (IsUnsigned) {
1685 Carry = CarryHi;
1686 } else {
1687 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1688 MRI.setRegBank(Carry, CarryBank);
1689 }
1690 } else {
1691 if (IsUnsigned) {
1692 Carry = B.buildConstant(CarryType, 0).getReg(0);
1693 MRI.setRegBank(Carry, CarryBank);
1694 }
1695 }
1696
1697 B.buildMerge(Dst0, {DstLo, DstHi});
1698
1699 if (DstOnValu) {
1700 B.buildCopy(Dst1, Carry);
1701 } else {
1702 B.buildTrunc(Dst1, Carry);
1703 }
1704
1705 MI.eraseFromParent();
1706 return true;
1707 }
1708
1709 // Return a suitable opcode for extending the operands of Opc when widening.
static unsigned getExtendOp(unsigned Opc) {
1711 switch (Opc) {
1712 case TargetOpcode::G_ASHR:
1713 case TargetOpcode::G_SMIN:
1714 case TargetOpcode::G_SMAX:
1715 return TargetOpcode::G_SEXT;
1716 case TargetOpcode::G_LSHR:
1717 case TargetOpcode::G_UMIN:
1718 case TargetOpcode::G_UMAX:
1719 return TargetOpcode::G_ZEXT;
1720 default:
1721 return TargetOpcode::G_ANYEXT;
1722 }
1723 }
1724
1725 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1726 // any illegal vector extend or unmerge operations.
1727 static std::pair<Register, Register>
unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1729 const LLT S32 = LLT::scalar(32);
1730 auto Bitcast = B.buildBitcast(S32, Src);
1731
1732 if (ExtOpcode == TargetOpcode::G_SEXT) {
1733 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1734 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1735 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1736 }
1737
1738 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1739 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1740 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1741 return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1742 }
1743
1744 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1745 return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1746 }
1747
// For cases where only a single copy is inserted for matching register banks,
// replace the register in the instruction operand.
static bool substituteSimpleCopyRegs(
1751 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1752 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1753 if (!SrcReg.empty()) {
1754 assert(SrcReg.size() == 1);
1755 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1756 return true;
1757 }
1758
1759 return false;
1760 }
1761
1762 /// Handle register layout difference for f16 images for some subtargets.
Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1764 MachineRegisterInfo &MRI,
1765 Register Reg) const {
1766 if (!Subtarget.hasUnpackedD16VMem())
1767 return Reg;
1768
1769 const LLT S16 = LLT::scalar(16);
1770 LLT StoreVT = MRI.getType(Reg);
1771 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1772 return Reg;
1773
1774 auto Unmerge = B.buildUnmerge(S16, Reg);
1775
1776
1777 SmallVector<Register, 4> WideRegs;
1778 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1779 WideRegs.push_back(Unmerge.getReg(I));
1780
1781 const LLT S32 = LLT::scalar(32);
1782 int NumElts = StoreVT.getNumElements();
1783
1784 return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1785 }
1786
1787 static std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1789 int64_t Const;
1790 if (mi_match(Reg, MRI, m_ICst(Const)))
1791 return std::make_pair(Register(), Const);
1792
1793 Register Base;
1794 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1795 return std::make_pair(Base, Const);
1796
1797 // TODO: Handle G_OR used for add case
1798 return std::make_pair(Reg, 0);
1799 }
1800
1801 std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1803 Register OrigOffset) const {
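  // The MUBUF immediate offset field is 12 bits, so it can hold at most 4095.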
1804 const unsigned MaxImm = 4095;
1805 Register BaseReg;
1806 unsigned ImmOffset;
1807 const LLT S32 = LLT::scalar(32);
1808
1809 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1810 OrigOffset);
1811
1812 unsigned C1 = 0;
1813 if (ImmOffset != 0) {
    // If the immediate value is too big for the immoffset field, keep only the
    // low 12 bits in the immoffset field, so that the value copied/added for
    // the voffset field is a multiple of 4096 and stands a better chance of
    // being CSEd with the copy/add for another similar load/store.
1818 // However, do not do that rounding down to a multiple of 4096 if that is a
1819 // negative number, as it appears to be illegal to have a negative offset
1820 // in the vgpr, even if adding the immediate offset makes it positive.
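    // For example, an offset of 5000 splits into Overflow = 4096 (voffset) and
    // ImmOffset = 904.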
1821 unsigned Overflow = ImmOffset & ~MaxImm;
1822 ImmOffset -= Overflow;
1823 if ((int32_t)Overflow < 0) {
1824 Overflow += ImmOffset;
1825 ImmOffset = 0;
1826 }
1827
1828 C1 = ImmOffset;
1829 if (Overflow != 0) {
1830 if (!BaseReg)
1831 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1832 else {
1833 auto OverflowVal = B.buildConstant(S32, Overflow);
1834 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1835 }
1836 }
1837 }
1838
1839 if (!BaseReg)
1840 BaseReg = B.buildConstant(S32, 0).getReg(0);
1841
1842 return {BaseReg, C1};
1843 }
1844
bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1846 Register SrcReg) const {
1847 MachineRegisterInfo &MRI = *B.getMRI();
1848 LLT SrcTy = MRI.getType(SrcReg);
1849 if (SrcTy.getSizeInBits() == 32) {
1850 // Use a v_mov_b32 here to make the exec dependency explicit.
1851 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1852 .addDef(DstReg)
1853 .addUse(SrcReg);
1854 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1855 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1856 }
1857
1858 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1859 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1860
1861 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1862 .addDef(TmpReg0)
1863 .addUse(SrcReg, 0, AMDGPU::sub0);
1864 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1865 .addDef(TmpReg1)
1866 .addUse(SrcReg, 0, AMDGPU::sub1);
1867 B.buildInstr(AMDGPU::REG_SEQUENCE)
1868 .addDef(DstReg)
1869 .addUse(TmpReg0)
1870 .addImm(AMDGPU::sub0)
1871 .addUse(TmpReg1)
1872 .addImm(AMDGPU::sub1);
1873
1874 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1875 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1876 }
1877
1878 /// Utility function for pushing dynamic vector indexes with a constant offset
1879 /// into waterfall loops.
static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1881 MachineInstr &IdxUseInstr,
1882 unsigned OpIdx,
1883 unsigned ConstOffset) {
1884 MachineRegisterInfo &MRI = *B.getMRI();
1885 const LLT S32 = LLT::scalar(32);
1886 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1887 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1888
1889 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1890
1891 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1892 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1893 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1894 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1895 }
1896
1897 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1898 /// original 32-bit source value (to be inserted in the low part of the combined
1899 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1900 /// value.
static void extendLow32IntoHigh32(MachineIRBuilder &B,
1902 Register Hi32Reg, Register Lo32Reg,
1903 unsigned ExtOpc,
1904 const RegisterBank &RegBank,
1905 bool IsBooleanSrc = false) {
1906 if (ExtOpc == AMDGPU::G_ZEXT) {
1907 B.buildConstant(Hi32Reg, 0);
1908 } else if (ExtOpc == AMDGPU::G_SEXT) {
1909 if (IsBooleanSrc) {
1910 // If we know the original source was an s1, the high half is the same as
1911 // the low.
1912 B.buildCopy(Hi32Reg, Lo32Reg);
1913 } else {
1914 // Replicate sign bit from 32-bit extended part.
1915 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1916 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1917 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1918 }
1919 } else {
1920 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1921 B.buildUndef(Hi32Reg);
1922 }
1923 }
1924
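// Lower a dynamically indexed vector extract into a chain of compares and
// selects over the unmerged vector elements, when profitable. This avoids
// needing a waterfall loop for a divergent index.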
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1926 MachineInstr &MI, MachineRegisterInfo &MRI,
1927 const OperandsMapper &OpdMapper) const {
1928
1929 Register VecReg = MI.getOperand(1).getReg();
1930 Register Idx = MI.getOperand(2).getReg();
1931
1932 const RegisterBank &IdxBank =
1933 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1934
1935 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1936
1937 LLT VecTy = MRI.getType(VecReg);
1938 unsigned EltSize = VecTy.getScalarSizeInBits();
1939 unsigned NumElem = VecTy.getNumElements();
1940
1941 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1942 IsDivergentIdx, &Subtarget))
1943 return false;
1944
1945 MachineIRBuilder B(MI);
1946 LLT S32 = LLT::scalar(32);
1947
1948 const RegisterBank &DstBank =
1949 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1950 const RegisterBank &SrcBank =
1951 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1952
1953 const RegisterBank &CCBank =
1954 (DstBank == AMDGPU::SGPRRegBank &&
1955 SrcBank == AMDGPU::SGPRRegBank &&
1956 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1957 : AMDGPU::VCCRegBank;
1958 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1959
1960 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1961 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1962 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1963 }
1964
1965 LLT EltTy = VecTy.getScalarType();
1966 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1967 unsigned NumLanes = DstRegs.size();
1968 if (!NumLanes)
1969 NumLanes = 1;
1970 else
1971 EltTy = MRI.getType(DstRegs[0]);
1972
1973 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1974 SmallVector<Register, 2> Res(NumLanes);
1975 for (unsigned L = 0; L < NumLanes; ++L)
1976 Res[L] = UnmergeToEltTy.getReg(L);
1977
1978 for (unsigned I = 1; I < NumElem; ++I) {
1979 auto IC = B.buildConstant(S32, I);
1980 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1981 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1982 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1983
1984 for (unsigned L = 0; L < NumLanes; ++L) {
1985 auto S = B.buildSelect(EltTy, Cmp,
1986 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1987
1988 for (unsigned N : { 0, 2, 3 })
1989 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1990
1991 Res[L] = S->getOperand(0).getReg();
1992 }
1993 }
1994
1995 for (unsigned L = 0; L < NumLanes; ++L) {
1996 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1997 B.buildCopy(DstReg, Res[L]);
1998 MRI.setRegBank(DstReg, DstBank);
1999 }
2000
2001 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2002 MI.eraseFromParent();
2003
2004 return true;
2005 }
2006
2007 // Insert a cross regbank copy for a register if it already has a bank that
2008 // differs from the one we want to set.
static Register constrainRegToBank(MachineRegisterInfo &MRI,
2010 MachineIRBuilder &B, Register &Reg,
2011 const RegisterBank &Bank) {
2012 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2013 if (CurrBank && *CurrBank != Bank) {
2014 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2015 MRI.setRegBank(Copy, Bank);
2016 return Copy;
2017 }
2018
2019 MRI.setRegBank(Reg, Bank);
2020 return Reg;
2021 }
2022
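// Lower a dynamically indexed vector insert into a chain of compares and
// selects, choosing between the inserted value and each original element,
// when profitable. This avoids needing a waterfall loop for a divergent index.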
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2024 MachineInstr &MI, MachineRegisterInfo &MRI,
2025 const OperandsMapper &OpdMapper) const {
2026
2027 Register VecReg = MI.getOperand(1).getReg();
2028 Register Idx = MI.getOperand(3).getReg();
2029
2030 const RegisterBank &IdxBank =
2031 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2032
2033 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2034
2035 LLT VecTy = MRI.getType(VecReg);
2036 unsigned EltSize = VecTy.getScalarSizeInBits();
2037 unsigned NumElem = VecTy.getNumElements();
2038
2039 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2040 IsDivergentIdx, &Subtarget))
2041 return false;
2042
2043 MachineIRBuilder B(MI);
2044 LLT S32 = LLT::scalar(32);
2045
2046 const RegisterBank &DstBank =
2047 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2048 const RegisterBank &SrcBank =
2049 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2050 const RegisterBank &InsBank =
2051 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2052
2053 const RegisterBank &CCBank =
2054 (DstBank == AMDGPU::SGPRRegBank &&
2055 SrcBank == AMDGPU::SGPRRegBank &&
2056 InsBank == AMDGPU::SGPRRegBank &&
2057 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2058 : AMDGPU::VCCRegBank;
2059 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2060
2061 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2062 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2063 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2064 }
2065
2066 LLT EltTy = VecTy.getScalarType();
2067 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2068 unsigned NumLanes = InsRegs.size();
2069 if (!NumLanes) {
2070 NumLanes = 1;
2071 InsRegs.push_back(MI.getOperand(2).getReg());
2072 } else {
2073 EltTy = MRI.getType(InsRegs[0]);
2074 }
2075
2076 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2077 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2078
2079 for (unsigned I = 0; I < NumElem; ++I) {
2080 auto IC = B.buildConstant(S32, I);
2081 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2082 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2083 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2084
2085 for (unsigned L = 0; L < NumLanes; ++L) {
2086 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2087 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2088 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2089
2090 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2091 MRI.setRegBank(Select, DstBank);
2092
2093 Ops[I * NumLanes + L] = Select;
2094 }
2095 }
2096
2097 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2098 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2099 B.buildBuildVector(MI.getOperand(0), Ops);
2100 } else {
2101 auto Vec = B.buildBuildVector(MergeTy, Ops);
2102 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2103 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2104 }
2105
2106 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2107 MI.eraseFromParent();
2108
2109 return true;
2110 }
2111
void AMDGPURegisterBankInfo::applyMappingImpl(
2113 const OperandsMapper &OpdMapper) const {
2114 MachineInstr &MI = OpdMapper.getMI();
2115 unsigned Opc = MI.getOpcode();
2116 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2117 switch (Opc) {
2118 case AMDGPU::G_PHI: {
2119 Register DstReg = MI.getOperand(0).getReg();
2120 LLT DstTy = MRI.getType(DstReg);
2121 if (DstTy != LLT::scalar(1))
2122 break;
2123
2124 const LLT S32 = LLT::scalar(32);
2125 const RegisterBank *DstBank =
2126 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2127 if (DstBank == &AMDGPU::VCCRegBank) {
2128 applyDefaultMapping(OpdMapper);
2129 // The standard handling only considers the result register bank for
2130 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2131 // produce an invalid copy. We can only copy with some kind of compare to
2132 // get a vector boolean result. Insert a register bank copy that will be
2133 // correctly lowered to a compare.
2134 MachineIRBuilder B(*MI.getParent()->getParent());
2135
2136 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2137 Register SrcReg = MI.getOperand(I).getReg();
2138 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2139
2140 if (SrcBank != &AMDGPU::VCCRegBank) {
2141 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2142 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2143
2144 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2145 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2146 MI.getOperand(I).setReg(Copy.getReg(0));
2147 }
2148 }
2149
2150 return;
2151 }
2152
2153 // Phi handling is strange and only considers the bank of the destination.
2154 substituteSimpleCopyRegs(OpdMapper, 0);
2155
2156 // Promote SGPR/VGPR booleans to s32
2157 MachineFunction *MF = MI.getParent()->getParent();
2158 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2159 MachineIRBuilder B(MI, ApplyBank);
2160 LegalizerHelper Helper(*MF, ApplyBank, B);
2161
2162 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2163 llvm_unreachable("widen scalar should have succeeded");
2164
2165 return;
2166 }
2167 case AMDGPU::G_ICMP:
2168 case AMDGPU::G_UADDO:
2169 case AMDGPU::G_USUBO:
2170 case AMDGPU::G_UADDE:
2171 case AMDGPU::G_SADDE:
2172 case AMDGPU::G_USUBE:
2173 case AMDGPU::G_SSUBE: {
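    // For G_ICMP the boolean result is operand 0; for the carry-producing
    // add/sub opcodes it is the second def (operand 1).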
2174 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2175 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2176
2177 const RegisterBank *DstBank =
2178 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2179 if (DstBank != &AMDGPU::SGPRRegBank)
2180 break;
2181
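    // The carry-in variants (G_UADDE, G_SADDE, G_USUBE, G_SSUBE) have two defs,
    // two sources, and a carry-in, for 5 operands total.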
2182 const bool HasCarryIn = MI.getNumOperands() == 5;
2183
2184 // If this is a scalar compare, promote the result to s32, as the selection
2185 // will end up using a copy to a 32-bit vreg.
2186 const LLT S32 = LLT::scalar(32);
2187 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2188 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2189 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2190 MachineIRBuilder B(MI);
2191
2192 if (HasCarryIn) {
2193 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2194 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2195 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2196 MI.getOperand(4).setReg(NewSrcReg);
2197 }
2198
2199 MachineBasicBlock *MBB = MI.getParent();
2200 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2201
2202 // If we had a constrained VCC result register, a copy was inserted to VCC
2203 // from SGPR.
2204 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2205 if (DefRegs.empty())
2206 DefRegs.push_back(DstReg);
2207 B.buildTrunc(DefRegs[0], NewDstReg);
2208 return;
2209 }
2210 case AMDGPU::G_SELECT: {
2211 Register DstReg = MI.getOperand(0).getReg();
2212 LLT DstTy = MRI.getType(DstReg);
2213
2214 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2215 if (CondRegs.empty())
2216 CondRegs.push_back(MI.getOperand(1).getReg());
2217 else {
2218 assert(CondRegs.size() == 1);
2219 }
2220
2221 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2222 if (CondBank == &AMDGPU::SGPRRegBank) {
2223 MachineIRBuilder B(MI);
2224 const LLT S32 = LLT::scalar(32);
2225 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2226 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2227
2228 MI.getOperand(1).setReg(NewCondReg);
2229 B.buildZExt(NewCondReg, CondRegs[0]);
2230 }
2231
2232 if (DstTy.getSizeInBits() != 64)
2233 break;
2234
2235 MachineIRBuilder B(MI);
2236 LLT HalfTy = getHalfSizedType(DstTy);
2237
2238 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2239 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2240 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2241
2242 // All inputs are SGPRs, nothing special to do.
2243 if (DefRegs.empty()) {
2244 assert(Src1Regs.empty() && Src2Regs.empty());
2245 break;
2246 }
2247
2248 if (Src1Regs.empty())
2249 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2250 else {
2251 setRegsToType(MRI, Src1Regs, HalfTy);
2252 }
2253
2254 if (Src2Regs.empty())
2255 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2256 else
2257 setRegsToType(MRI, Src2Regs, HalfTy);
2258
2259 setRegsToType(MRI, DefRegs, HalfTy);
2260
2261 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2262 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2263
2264 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2265 MI.eraseFromParent();
2266 return;
2267 }
2268 case AMDGPU::G_BRCOND: {
2269 Register CondReg = MI.getOperand(0).getReg();
2270 // FIXME: Should use legalizer helper, but should change bool ext type.
2271 const RegisterBank *CondBank =
2272 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2273
2274 if (CondBank == &AMDGPU::SGPRRegBank) {
2275 MachineIRBuilder B(MI);
2276 const LLT S32 = LLT::scalar(32);
2277 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2278 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2279
2280 MI.getOperand(0).setReg(NewCondReg);
2281 B.buildZExt(NewCondReg, CondReg);
2282 return;
2283 }
2284
2285 break;
2286 }
2287 case AMDGPU::G_AND:
2288 case AMDGPU::G_OR:
2289 case AMDGPU::G_XOR: {
2290 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2291 // there is a VGPR input.
2292 Register DstReg = MI.getOperand(0).getReg();
2293 LLT DstTy = MRI.getType(DstReg);
2294
2295 if (DstTy.getSizeInBits() == 1) {
2296 const RegisterBank *DstBank =
2297 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2298 if (DstBank == &AMDGPU::VCCRegBank)
2299 break;
2300
2301 MachineFunction *MF = MI.getParent()->getParent();
2302 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2303 MachineIRBuilder B(MI, ApplyBank);
2304 LegalizerHelper Helper(*MF, ApplyBank, B);
2305
2306 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2307 LegalizerHelper::Legalized)
2308 llvm_unreachable("widen scalar should have succeeded");
2309 return;
2310 }
2311
2312 if (DstTy.getSizeInBits() != 64)
2313 break;
2314
2315 LLT HalfTy = getHalfSizedType(DstTy);
2316 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2317 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2318 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2319
2320 // All inputs are SGPRs, nothing special to do.
2321 if (DefRegs.empty()) {
2322 assert(Src0Regs.empty() && Src1Regs.empty());
2323 break;
2324 }
2325
2326 assert(DefRegs.size() == 2);
2327 assert(Src0Regs.size() == Src1Regs.size() &&
2328 (Src0Regs.empty() || Src0Regs.size() == 2));
2329
2330 // Depending on where the source registers came from, the generic code may
2331 // have decided to split the inputs already or not. If not, we still need to
2332 // extract the values.
2333 MachineIRBuilder B(MI);
2334
2335 if (Src0Regs.empty())
2336 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2337 else
2338 setRegsToType(MRI, Src0Regs, HalfTy);
2339
2340 if (Src1Regs.empty())
2341 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2342 else
2343 setRegsToType(MRI, Src1Regs, HalfTy);
2344
2345 setRegsToType(MRI, DefRegs, HalfTy);
2346
2347 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2348 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2349
2350 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2351 MI.eraseFromParent();
2352 return;
2353 }
2354 case AMDGPU::G_ABS: {
2355 Register SrcReg = MI.getOperand(1).getReg();
2356 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2357
2358 // There is no VALU abs instruction so we need to replace it with a sub and
2359 // max combination.
2360 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2361 MachineFunction *MF = MI.getParent()->getParent();
2362 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2363 MachineIRBuilder B(MI, Apply);
2364 LegalizerHelper Helper(*MF, Apply, B);
2365
2366 if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2367 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2368 return;
2369 }
2370 LLVM_FALLTHROUGH;
2371 }
2372 case AMDGPU::G_ADD:
2373 case AMDGPU::G_SUB:
2374 case AMDGPU::G_MUL:
2375 case AMDGPU::G_SHL:
2376 case AMDGPU::G_LSHR:
2377 case AMDGPU::G_ASHR:
2378 case AMDGPU::G_SMIN:
2379 case AMDGPU::G_SMAX:
2380 case AMDGPU::G_UMIN:
2381 case AMDGPU::G_UMAX: {
2382 Register DstReg = MI.getOperand(0).getReg();
2383 LLT DstTy = MRI.getType(DstReg);
2384
2385 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2386 // Packed 16-bit operations need to be scalarized and promoted.
2387 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2388 break;
2389
2390 const RegisterBank *DstBank =
2391 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2392 if (DstBank == &AMDGPU::VGPRRegBank)
2393 break;
2394
2395 const LLT S32 = LLT::scalar(32);
2396 MachineBasicBlock *MBB = MI.getParent();
2397 MachineFunction *MF = MBB->getParent();
2398 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2399 MachineIRBuilder B(MI, ApplySALU);
2400
2401 if (DstTy.isVector()) {
2402 Register WideSrc0Lo, WideSrc0Hi;
2403 Register WideSrc1Lo, WideSrc1Hi;
2404
2405 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2406 std::tie(WideSrc0Lo, WideSrc0Hi)
2407 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2408 std::tie(WideSrc1Lo, WideSrc1Hi)
2409 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2410 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2411 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2412 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2413 MI.eraseFromParent();
2414 } else {
2415 LegalizerHelper Helper(*MF, ApplySALU, B);
2416
2417 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2418 llvm_unreachable("widen scalar should have succeeded");
2419
2420 // FIXME: s16 shift amounts should be legal.
2421 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2422 Opc == AMDGPU::G_ASHR) {
2423 B.setInsertPt(*MBB, MI.getIterator());
2424 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2425 llvm_unreachable("widen scalar should have succeeded");
2426 }
2427 }
2428
2429 return;
2430 }
2431 case AMDGPU::G_SEXT_INREG: {
2432 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2433 if (SrcRegs.empty())
2434 break; // Nothing to repair
2435
2436 const LLT S32 = LLT::scalar(32);
2437 MachineIRBuilder B(MI);
2438 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2439 GISelObserverWrapper Observer(&O);
2440 B.setChangeObserver(Observer);
2441
2442 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2443 // we would need to further expand, and doesn't let us directly set the
2444 // result registers.
2445 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2446
2447 int Amt = MI.getOperand(2).getImm();
2448 if (Amt <= 32) {
2449 if (Amt == 32) {
2450 // The low bits are unchanged.
2451 B.buildCopy(DstRegs[0], SrcRegs[0]);
2452 } else {
2453 // Extend in the low bits and propagate the sign bit to the high half.
2454 B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2455 }
2456
2457 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2458 } else {
2459 // The low bits are unchanged, and extend in the high bits.
2460 B.buildCopy(DstRegs[0], SrcRegs[0]);
2461 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2462 }
2463
2464 Register DstReg = MI.getOperand(0).getReg();
2465 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2466 MI.eraseFromParent();
2467 return;
2468 }
2469 case AMDGPU::G_CTPOP:
2470 case AMDGPU::G_BITREVERSE: {
2471 const RegisterBank *DstBank =
2472 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2473 if (DstBank == &AMDGPU::SGPRRegBank)
2474 break;
2475
2476 Register SrcReg = MI.getOperand(1).getReg();
2477 const LLT S32 = LLT::scalar(32);
2478 LLT Ty = MRI.getType(SrcReg);
2479 if (Ty == S32)
2480 break;
2481
2482 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2483 MachineIRBuilder B(MI, ApplyVALU);
2484
2485 MachineFunction &MF = B.getMF();
2486 LegalizerHelper Helper(MF, ApplyVALU, B);
2487
2488 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2489 llvm_unreachable("narrowScalar should have succeeded");
2490 return;
2491 }
2492 case AMDGPU::G_AMDGPU_FFBH_U32:
2493 case AMDGPU::G_AMDGPU_FFBL_B32:
2494 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2495 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2496 const RegisterBank *DstBank =
2497 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2498 if (DstBank == &AMDGPU::SGPRRegBank)
2499 break;
2500
2501 Register SrcReg = MI.getOperand(1).getReg();
2502 const LLT S32 = LLT::scalar(32);
2503 LLT Ty = MRI.getType(SrcReg);
2504 if (Ty == S32)
2505 break;
2506
2507 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2508 // which return -1 when the input is zero:
2509 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2510 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2511 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2512 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2513 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2514 MachineIRBuilder B(MI, ApplyVALU);
2515 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2516 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2517 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2518 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2519 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2520 : Opc;
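    // Scan the half that is reached first without the +32 adjustment: the high
    // half for ffbh (leading zeros), the low half for ffbl (trailing zeros).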
2521 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2522 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2523 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2524 unsigned AddOpc =
2525 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2526 ? AMDGPU::G_ADD
2527 : AMDGPU::G_UADDSAT;
2528 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2529 Register DstReg = MI.getOperand(0).getReg();
2530 B.buildUMin(DstReg, X, Y);
2531 MI.eraseFromParent();
2532 return;
2533 }
2534 case AMDGPU::G_SEXT:
2535 case AMDGPU::G_ZEXT:
2536 case AMDGPU::G_ANYEXT: {
2537 Register SrcReg = MI.getOperand(1).getReg();
2538 LLT SrcTy = MRI.getType(SrcReg);
2539 const bool Signed = Opc == AMDGPU::G_SEXT;
2540
2541 assert(empty(OpdMapper.getVRegs(1)));
2542
2543 MachineIRBuilder B(MI);
2544 const RegisterBank *SrcBank =
2545 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2546
2547 Register DstReg = MI.getOperand(0).getReg();
2548 LLT DstTy = MRI.getType(DstReg);
2549 if (DstTy.isScalar() &&
2550 SrcBank != &AMDGPU::SGPRRegBank &&
2551 SrcBank != &AMDGPU::VCCRegBank &&
        // FIXME: Should handle any type that rounds to s64 when irregular
        // breakdowns are supported.
2554 DstTy.getSizeInBits() == 64 &&
2555 SrcTy.getSizeInBits() <= 32) {
2556 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2557
2558 // Extend to 32-bit, and then extend the low half.
2559 if (Signed) {
2560 // TODO: Should really be buildSExtOrCopy
2561 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2562 } else if (Opc == AMDGPU::G_ZEXT) {
2563 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2564 } else {
2565 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2566 }
2567
2568 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2569 MRI.setRegBank(DstReg, *SrcBank);
2570 MI.eraseFromParent();
2571 return;
2572 }
2573
2574 if (SrcTy != LLT::scalar(1))
2575 return;
2576
    // It is not legal to have a legalization artifact with a VCC source. Rather
    // than introduce a copy, directly emit the select that such a copy would
    // have been lowered to.
2580 if (SrcBank == &AMDGPU::VCCRegBank) {
2581 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2582
2583 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2584
2585 unsigned DstSize = DstTy.getSizeInBits();
2586 // 64-bit select is SGPR only
2587 const bool UseSel64 = DstSize > 32 &&
2588 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2589
2590 // TODO: Should s16 select be legal?
2591 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2592 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2593 auto False = B.buildConstant(SelType, 0);
2594
2595 MRI.setRegBank(True.getReg(0), *DstBank);
2596 MRI.setRegBank(False.getReg(0), *DstBank);
2597 MRI.setRegBank(DstReg, *DstBank);
2598
2599 if (DstSize > 32) {
2600 B.buildSelect(DefRegs[0], SrcReg, True, False);
2601 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2602 } else if (DstSize < 32) {
2603 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2604 MRI.setRegBank(Sel.getReg(0), *DstBank);
2605 B.buildTrunc(DstReg, Sel);
2606 } else {
2607 B.buildSelect(DstReg, SrcReg, True, False);
2608 }
2609
2610 MI.eraseFromParent();
2611 return;
2612 }
2613
2614 break;
2615 }
2616 case AMDGPU::G_BUILD_VECTOR:
2617 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2618 Register DstReg = MI.getOperand(0).getReg();
2619 LLT DstTy = MRI.getType(DstReg);
2620 if (DstTy != LLT::fixed_vector(2, 16))
2621 break;
2622
2623 assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2624 substituteSimpleCopyRegs(OpdMapper, 1);
2625 substituteSimpleCopyRegs(OpdMapper, 2);
2626
2627 const RegisterBank *DstBank =
2628 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2629 if (DstBank == &AMDGPU::SGPRRegBank)
2630 break; // Can use S_PACK_* instructions.
2631
2632 MachineIRBuilder B(MI);
2633
2634 Register Lo = MI.getOperand(1).getReg();
2635 Register Hi = MI.getOperand(2).getReg();
2636 const LLT S32 = LLT::scalar(32);
2637
2638 const RegisterBank *BankLo =
2639 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2640 const RegisterBank *BankHi =
2641 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2642
2643 Register ZextLo;
2644 Register ShiftHi;
2645
2646 if (Opc == AMDGPU::G_BUILD_VECTOR) {
2647 ZextLo = B.buildZExt(S32, Lo).getReg(0);
2648 MRI.setRegBank(ZextLo, *BankLo);
2649
2650 Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2651 MRI.setRegBank(ZextHi, *BankHi);
2652
2653 auto ShiftAmt = B.buildConstant(S32, 16);
2654 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2655
2656 ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2657 MRI.setRegBank(ShiftHi, *BankHi);
2658 } else {
2659 Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2660 MRI.setRegBank(MaskLo, *BankLo);
2661
2662 auto ShiftAmt = B.buildConstant(S32, 16);
2663 MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2664
2665 ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2666 MRI.setRegBank(ShiftHi, *BankHi);
2667
2668 ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2669 MRI.setRegBank(ZextLo, *BankLo);
2670 }
2671
2672 auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2673 MRI.setRegBank(Or.getReg(0), *DstBank);
2674
2675 B.buildBitcast(DstReg, Or);
2676 MI.eraseFromParent();
2677 return;
2678 }
2679 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2680 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2681
2682 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2683
2684 Register DstReg = MI.getOperand(0).getReg();
2685 Register SrcReg = MI.getOperand(1).getReg();
2686
2687 const LLT S32 = LLT::scalar(32);
2688 LLT DstTy = MRI.getType(DstReg);
2689 LLT SrcTy = MRI.getType(SrcReg);
2690
2691 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2692 return;
2693
2694 MachineIRBuilder B(MI);
2695
2696 const ValueMapping &DstMapping
2697 = OpdMapper.getInstrMapping().getOperandMapping(0);
2698 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2699 const RegisterBank *SrcBank =
2700 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2701 const RegisterBank *IdxBank =
2702 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2703
2704 Register BaseIdxReg;
2705 unsigned ConstOffset;
2706 std::tie(BaseIdxReg, ConstOffset) =
2707 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2708
2709 // See if the index is an add of a constant which will be foldable by moving
2710 // the base register of the index later if this is going to be executed in a
2711 // waterfall loop. This is essentially to reassociate the add of a constant
2712 // with the readfirstlane.
2713 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2714 ConstOffset > 0 &&
2715 ConstOffset < SrcTy.getNumElements();
2716
2717 // Move the base register. We'll re-insert the add later.
2718 if (ShouldMoveIndexIntoLoop)
2719 MI.getOperand(2).setReg(BaseIdxReg);
2720
2721 // If this is a VGPR result only because the index was a VGPR result, the
2722 // actual indexing will be done on the SGPR source vector, which will
2723 // produce a scalar result. We need to copy to the VGPR result inside the
2724 // waterfall loop.
2725 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2726 SrcBank == &AMDGPU::SGPRRegBank;
2727 if (DstRegs.empty()) {
2728 applyDefaultMapping(OpdMapper);
2729
2730 executeInWaterfallLoop(MI, MRI, { 2 });
2731
2732 if (NeedCopyToVGPR) {
2733 // We don't want a phi for this temporary reg.
2734 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2735 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2736 MI.getOperand(0).setReg(TmpReg);
2737 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2738
2739 // Use a v_mov_b32 here to make the exec dependency explicit.
2740 buildVCopy(B, DstReg, TmpReg);
2741 }
2742
2743 // Re-insert the constant offset add inside the waterfall loop.
2744 if (ShouldMoveIndexIntoLoop)
2745 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2746
2747 return;
2748 }
2749
2750 assert(DstTy.getSizeInBits() == 64);
2751
2752 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2753
2754 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2755 auto One = B.buildConstant(S32, 1);
2756
2757 MachineBasicBlock::iterator MII = MI.getIterator();
2758
2759 // Split the vector index into 32-bit pieces. Prepare to move all of the
2760 // new instructions into a waterfall loop if necessary.
2761 //
2762 // Don't put the bitcast or constant in the loop.
2763 MachineInstrSpan Span(MII, &B.getMBB());
2764
2765 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2766 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2767 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2768
2769 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2770 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2771
2772 MRI.setRegBank(DstReg, *DstBank);
2773 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2774 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2775 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2776 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2777
2778 SmallSet<Register, 4> OpsToWaterfall;
2779 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2780 MI.eraseFromParent();
2781 return;
2782 }
2783
2784 // Remove the original instruction to avoid potentially confusing the
2785 // waterfall loop logic.
2786 B.setInstr(*Span.begin());
2787 MI.eraseFromParent();
2788 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2789 OpsToWaterfall, MRI);
2790
2791 if (NeedCopyToVGPR) {
2792 MachineBasicBlock *LoopBB = Extract1->getParent();
2793 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2794 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2795 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2796 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2797
2798 Extract0->getOperand(0).setReg(TmpReg0);
2799 Extract1->getOperand(0).setReg(TmpReg1);
2800
2801 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2802
2803 buildVCopy(B, DstRegs[0], TmpReg0);
2804 buildVCopy(B, DstRegs[1], TmpReg1);
2805 }
2806
2807 if (ShouldMoveIndexIntoLoop)
2808 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2809
2810 return;
2811 }
2812 case AMDGPU::G_INSERT_VECTOR_ELT: {
2813 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2814
2815 Register DstReg = MI.getOperand(0).getReg();
2816 LLT VecTy = MRI.getType(DstReg);
2817
2818 assert(OpdMapper.getVRegs(0).empty());
2819 assert(OpdMapper.getVRegs(3).empty());
2820
2821 if (substituteSimpleCopyRegs(OpdMapper, 1))
2822 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2823
2824 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2825 return;
2826
2827 const RegisterBank *IdxBank =
2828 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2829
2830 Register SrcReg = MI.getOperand(1).getReg();
2831 Register InsReg = MI.getOperand(2).getReg();
2832 LLT InsTy = MRI.getType(InsReg);
2833 (void)InsTy;
2834
2835 Register BaseIdxReg;
2836 unsigned ConstOffset;
2837 std::tie(BaseIdxReg, ConstOffset) =
2838 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2839
2840 // See if the index is an add of a constant which will be foldable by moving
2841 // the base register of the index later if this is going to be executed in a
2842 // waterfall loop. This is essentially to reassociate the add of a constant
2843 // with the readfirstlane.
2844 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2845 ConstOffset > 0 &&
2846 ConstOffset < VecTy.getNumElements();
2847
2848 // Move the base register. We'll re-insert the add later.
2849 if (ShouldMoveIndexIntoLoop)
2850 MI.getOperand(3).setReg(BaseIdxReg);
2851
2852
2853 if (InsRegs.empty()) {
2854 executeInWaterfallLoop(MI, MRI, { 3 });
2855
2856 // Re-insert the constant offset add inside the waterfall loop.
2857 if (ShouldMoveIndexIntoLoop) {
2858 MachineIRBuilder B(MI);
2859 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2860 }
2861
2862 return;
2863 }
2864
2865
2866 assert(InsTy.getSizeInBits() == 64);
2867
2868 const LLT S32 = LLT::scalar(32);
2869 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2870
2871 MachineIRBuilder B(MI);
2872 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2873 auto One = B.buildConstant(S32, 1);
2874
2875 // Split the vector index into 32-bit pieces. Prepare to move all of the
2876 // new instructions into a waterfall loop if necessary.
2877 //
2878 // Don't put the bitcast or constant in the loop.
2879 MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2880
2881 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2882 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2883 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2884
2885 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2886 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2887
2888 const RegisterBank *DstBank =
2889 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2890 const RegisterBank *SrcBank =
2891 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2892 const RegisterBank *InsSrcBank =
2893 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2894
2895 MRI.setRegBank(InsReg, *InsSrcBank);
2896 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2897 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2898 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2899 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2900 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2901 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2902
2903
2904 SmallSet<Register, 4> OpsToWaterfall;
2905 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2906 B.setInsertPt(B.getMBB(), MI);
2907 B.buildBitcast(DstReg, InsHi);
2908 MI.eraseFromParent();
2909 return;
2910 }
2911
2912 B.setInstr(*Span.begin());
2913 MI.eraseFromParent();
2914
2915 // Figure out the point after the waterfall loop before mangling the control
2916 // flow.
2917 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2918 OpsToWaterfall, MRI);
2919
2920 // The insertion point is now right after the original instruction.
2921 //
    // Keep the bitcast to the original vector type out of the loop. Doing this
    // saves an extra phi we don't need inside the loop.
2924 B.buildBitcast(DstReg, InsHi);
2925
2926 // Re-insert the constant offset add inside the waterfall loop.
2927 if (ShouldMoveIndexIntoLoop)
2928 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2929
2930 return;
2931 }
2932 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2933 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2934 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2935 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2936 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2937 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2938 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2939 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2940 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2941 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2942 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2943 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2944 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2945 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2946 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2947 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2948 applyDefaultMapping(OpdMapper);
2949 executeInWaterfallLoop(MI, MRI, {1, 4});
2950 return;
2951 }
2952 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2953 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2954 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2955 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2956 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2957 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2958 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2959 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2960 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2961 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2962 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2963 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2964 applyDefaultMapping(OpdMapper);
2965 executeInWaterfallLoop(MI, MRI, {2, 5});
2966 return;
2967 }
2968 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2969 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2970 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2971 applyDefaultMapping(OpdMapper);
2972 executeInWaterfallLoop(MI, MRI, {2, 5});
2973 return;
2974 }
2975 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2976 applyDefaultMapping(OpdMapper);
2977 executeInWaterfallLoop(MI, MRI, {3, 6});
2978 return;
2979 }
2980 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2981 applyMappingSBufferLoad(OpdMapper);
2982 return;
2983 }
2984 case AMDGPU::G_INTRINSIC: {
2985 switch (MI.getIntrinsicID()) {
2986 case Intrinsic::amdgcn_readlane: {
2987 substituteSimpleCopyRegs(OpdMapper, 2);
2988
2989 assert(OpdMapper.getVRegs(0).empty());
2990 assert(OpdMapper.getVRegs(3).empty());
2991
2992 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2993 // waterfall loop, so assume it's a uniform value.
2994 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2995 return;
2996 }
2997 case Intrinsic::amdgcn_writelane: {
2998 assert(OpdMapper.getVRegs(0).empty());
2999 assert(OpdMapper.getVRegs(2).empty());
3000 assert(OpdMapper.getVRegs(3).empty());
3001
3002 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3003 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
3004 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
3005 return;
3006 }
3007 case Intrinsic::amdgcn_interp_p1:
3008 case Intrinsic::amdgcn_interp_p2:
3009 case Intrinsic::amdgcn_interp_mov:
3010 case Intrinsic::amdgcn_interp_p1_f16:
3011 case Intrinsic::amdgcn_interp_p2_f16:
3012 case Intrinsic::amdgcn_lds_param_load: {
3013 applyDefaultMapping(OpdMapper);
3014
3015 // Readfirstlane for the m0 value, which is always the last operand.
3016 // FIXME: Should this be a waterfall loop instead?
3017 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3018 return;
3019 }
3020 case Intrinsic::amdgcn_interp_inreg_p10:
3021 case Intrinsic::amdgcn_interp_inreg_p2:
3022 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3023 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3024 applyDefaultMapping(OpdMapper);
3025 return;
3026 case Intrinsic::amdgcn_permlane16:
3027 case Intrinsic::amdgcn_permlanex16: {
3028 // Doing a waterfall loop over these wouldn't make any sense.
3029 substituteSimpleCopyRegs(OpdMapper, 2);
3030 substituteSimpleCopyRegs(OpdMapper, 3);
3031 constrainOpWithReadfirstlane(MI, MRI, 4);
3032 constrainOpWithReadfirstlane(MI, MRI, 5);
3033 return;
3034 }
3035 case Intrinsic::amdgcn_sbfe:
3036 applyMappingBFE(OpdMapper, true);
3037 return;
3038 case Intrinsic::amdgcn_ubfe:
3039 applyMappingBFE(OpdMapper, false);
3040 return;
3041 case Intrinsic::amdgcn_ballot:
3042 // Use default handling and insert copy to vcc source.
3043 break;
3044 }
3045 break;
3046 }
3047 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3048 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3049 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3050 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3051 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3052 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3053 assert(RSrcIntrin && RSrcIntrin->IsImage);
3054 // Non-images can have complications from operands that allow both SGPR
3055 // and VGPR. For now it's too complicated to figure out the final opcode
3056 // to derive the register bank from the MCInstrDesc.
3057 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3058 return;
3059 }
3060 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3061 unsigned N = MI.getNumExplicitOperands() - 2;
3062 applyDefaultMapping(OpdMapper);
3063 executeInWaterfallLoop(MI, MRI, { N });
3064 return;
3065 }
3066 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3067 auto IntrID = MI.getIntrinsicID();
3068 switch (IntrID) {
3069 case Intrinsic::amdgcn_ds_ordered_add:
3070 case Intrinsic::amdgcn_ds_ordered_swap: {
3071 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3072 assert(OpdMapper.getVRegs(0).empty());
3073 substituteSimpleCopyRegs(OpdMapper, 3);
3074 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3075 return;
3076 }
3077 case Intrinsic::amdgcn_ds_gws_init:
3078 case Intrinsic::amdgcn_ds_gws_barrier:
3079 case Intrinsic::amdgcn_ds_gws_sema_br: {
3080 // Only the first lane executes, so readfirstlane is safe.
3081 substituteSimpleCopyRegs(OpdMapper, 1);
3082 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3083 return;
3084 }
3085 case Intrinsic::amdgcn_ds_gws_sema_v:
3086 case Intrinsic::amdgcn_ds_gws_sema_p:
3087 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3088 // Only the first lane executes, so readfirstlane is safe.
3089 constrainOpWithReadfirstlane(MI, MRI, 1); // M0
3090 return;
3091 }
3092 case Intrinsic::amdgcn_ds_append:
3093 case Intrinsic::amdgcn_ds_consume: {
3094 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3095 return;
3096 }
3097 case Intrinsic::amdgcn_s_sendmsg:
3098 case Intrinsic::amdgcn_s_sendmsghalt: {
3099 // FIXME: Should this use a waterfall loop?
3100 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3101 return;
3102 }
3103 case Intrinsic::amdgcn_s_setreg: {
3104 constrainOpWithReadfirstlane(MI, MRI, 2);
3105 return;
3106 }
3107 case Intrinsic::amdgcn_raw_buffer_load_lds: {
3108 applyDefaultMapping(OpdMapper);
3109 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3110 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3111 constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3112 return;
3113 }
3114 case Intrinsic::amdgcn_struct_buffer_load_lds: {
3115 applyDefaultMapping(OpdMapper);
3116 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3117 constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3118 constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3119 return;
3120 }
3121 case Intrinsic::amdgcn_global_load_lds: {
3122 applyDefaultMapping(OpdMapper);
3123 constrainOpWithReadfirstlane(MI, MRI, 2);
3124 return;
3125 }
3126 case Intrinsic::amdgcn_lds_direct_load: {
3127 applyDefaultMapping(OpdMapper);
3128 // Readfirstlane for the m0 value, which is always the last operand.
3129 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3130 return;
3131 }
3132 case Intrinsic::amdgcn_exp_row:
3133 applyDefaultMapping(OpdMapper);
3134 constrainOpWithReadfirstlane(MI, MRI, 8); // M0
3135 return;
3136 default: {
3137 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3138 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3139 // Non-images can have complications from operands that allow both SGPR
3140 // and VGPR. For now it's too complicated to figure out the final opcode
3141 // to derive the register bank from the MCInstrDesc.
3142 if (RSrcIntrin->IsImage) {
3143 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3144 return;
3145 }
3146 }
3147
3148 break;
3149 }
3150 }
3151 break;
3152 }
3153 case AMDGPU::G_SI_CALL: {
3154 // Use a set to avoid extra readfirstlanes in the case where multiple
3155 // operands are the same register.
3156 SmallSet<Register, 4> SGPROperandRegs;
3157
3158 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3159 break;
3160
3161 // Move all copies to physical SGPRs that are used by the call instruction
3162 // into the loop block. Search backwards from the call for these copies
3163 // until the ADJCALLSTACKUP is reached.
3164 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3165 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3166
3167 // Move all non-copies before the copies, so that a complete range can be
3168 // moved into the waterfall loop.
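// After this rearrangement the spliced range roughly covers only the
// argument copies, the call, and the result copies, so it can be moved into
// the waterfall loop as one contiguous block.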
3169 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3170 // Count of NonCopyInstrs found until the current LastCopy.
3171 unsigned NonCopyInstrsLen = 0;
3172 MachineBasicBlock::iterator Start(&MI);
3173 MachineBasicBlock::iterator LastCopy = Start;
3174 MachineBasicBlock *MBB = MI.getParent();
3175 const SIMachineFunctionInfo *Info =
3176 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3177 while (Start->getOpcode() != FrameSetupOpcode) {
3178 --Start;
3179 bool IsCopy = false;
3180 if (Start->getOpcode() == AMDGPU::COPY) {
3181 auto &Dst = Start->getOperand(0);
3182 if (Dst.isReg()) {
3183 Register Reg = Dst.getReg();
3184 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3185 IsCopy = true;
3186 } else {
3187 // Also move the copy from the scratch rsrc descriptor into the loop
3188 // to allow it to be optimized away.
3189 auto &Src = Start->getOperand(1);
3190 if (Src.isReg()) {
3191 Reg = Src.getReg();
3192 IsCopy = Info->getScratchRSrcReg() == Reg;
3193 }
3194 }
3195 }
3196 }
3197
3198 if (IsCopy) {
3199 LastCopy = Start;
3200 NonCopyInstrsLen = NonCopyInstrs.size();
3201 } else {
3202 NonCopyInstrs.push_back(&*Start);
3203 }
3204 }
3205 NonCopyInstrs.resize(NonCopyInstrsLen);
3206
3207 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3208 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3209 }
3210 Start = LastCopy;
3211
3212 // Do the same for copies after the loop
3213 NonCopyInstrs.clear();
3214 NonCopyInstrsLen = 0;
3215 MachineBasicBlock::iterator End(&MI);
3216 LastCopy = End;
3217 while (End->getOpcode() != FrameDestroyOpcode) {
3218 ++End;
3219 bool IsCopy = false;
3220 if (End->getOpcode() == AMDGPU::COPY) {
3221 auto &Src = End->getOperand(1);
3222 if (Src.isReg()) {
3223 Register Reg = Src.getReg();
3224 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3225 }
3226 }
3227
3228 if (IsCopy) {
3229 LastCopy = End;
3230 NonCopyInstrsLen = NonCopyInstrs.size();
3231 } else {
3232 NonCopyInstrs.push_back(&*End);
3233 }
3234 }
3235 NonCopyInstrs.resize(NonCopyInstrsLen);
3236
3237 End = LastCopy;
3238 ++LastCopy;
3239 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3240 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3241 }
3242
3243 ++End;
3244 MachineIRBuilder B(*Start);
3245 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3246 break;
3247 }
3248 case AMDGPU::G_LOAD:
3249 case AMDGPU::G_ZEXTLOAD:
3250 case AMDGPU::G_SEXTLOAD: {
3251 if (applyMappingLoad(MI, OpdMapper, MRI))
3252 return;
3253 break;
3254 }
3255 case AMDGPU::G_DYN_STACKALLOC:
3256 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3257 return;
3258 case AMDGPU::G_SBFX:
3259 applyMappingBFE(OpdMapper, /*Signed*/ true);
3260 return;
3261 case AMDGPU::G_UBFX:
3262 applyMappingBFE(OpdMapper, /*Signed*/ false);
3263 return;
3264 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3265 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3266 applyMappingMAD_64_32(OpdMapper);
3267 return;
3268 default:
3269 break;
3270 }
3271
3272 return applyDefaultMapping(OpdMapper);
3273 }
3274
3275 // vgpr, sgpr -> vgpr
3276 // vgpr, agpr -> vgpr
3277 // agpr, agpr -> agpr
3278 // agpr, sgpr -> vgpr
3279 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3280 if (RB0 == AMDGPU::InvalidRegBankID)
3281 return RB1;
3282 if (RB1 == AMDGPU::InvalidRegBankID)
3283 return RB0;
3284
3285 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3286 return AMDGPU::SGPRRegBankID;
3287
3288 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3289 return AMDGPU::AGPRRegBankID;
3290
3291 return AMDGPU::VGPRRegBankID;
3292 }
3293
3294 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3295 if (RB0 == AMDGPU::InvalidRegBankID)
3296 return RB1;
3297 if (RB1 == AMDGPU::InvalidRegBankID)
3298 return RB0;
3299
3300 // vcc, vcc -> vcc
3301 // vcc, sgpr -> vcc
3302 // vcc, vgpr -> vcc
3303 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3304 return AMDGPU::VCCRegBankID;
3305
3306 // Neither operand is vcc; fall back to the plain register bank union.
3307 return regBankUnion(RB0, RB1);
3308 }
3309
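/// Return the register bank that results from merging the banks of all
/// register operands of \p MI that already have one assigned (using
/// regBankUnion), or AMDGPU::InvalidRegBankID if none has been assigned yet.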
3310 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3311 const MachineInstr &MI) const {
3312 unsigned RegBank = AMDGPU::InvalidRegBankID;
3313
3314 for (const MachineOperand &MO : MI.operands()) {
3315 if (!MO.isReg())
3316 continue;
3317 Register Reg = MO.getReg();
3318 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3319 RegBank = regBankUnion(RegBank, Bank->getID());
3320 if (RegBank == AMDGPU::VGPRRegBankID)
3321 break;
3322 }
3323 }
3324
3325 return RegBank;
3326 }
3327
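/// Return true if every register operand of \p MI that already has an
/// assigned bank is on the SGPR bank, i.e. the instruction may use a pure
/// scalar (SALU) mapping.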
3328 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3329 const MachineFunction &MF = *MI.getParent()->getParent();
3330 const MachineRegisterInfo &MRI = MF.getRegInfo();
3331 for (const MachineOperand &MO : MI.operands()) {
3332 if (!MO.isReg())
3333 continue;
3334 Register Reg = MO.getReg();
3335 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3336 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3337 return false;
3338 }
3339 }
3340 return true;
3341 }
3342
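/// Build a mapping that places every register operand of \p MI on the SGPR
/// bank, sized by the operand's type.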
3343 const RegisterBankInfo::InstructionMapping &
3344 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3345 const MachineFunction &MF = *MI.getParent()->getParent();
3346 const MachineRegisterInfo &MRI = MF.getRegInfo();
3347 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3348
3349 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3350 const MachineOperand &SrcOp = MI.getOperand(i);
3351 if (!SrcOp.isReg())
3352 continue;
3353
3354 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3355 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3356 }
3357 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3358 MI.getNumOperands());
3359 }
3360
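/// Build a mapping that places every register operand of \p MI on the VGPR
/// bank, except that s1 operands are placed on the VCC bank.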
3361 const RegisterBankInfo::InstructionMapping &
3362 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3363 const MachineFunction &MF = *MI.getParent()->getParent();
3364 const MachineRegisterInfo &MRI = MF.getRegInfo();
3365 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3366
3367 // Even though we technically could use SGPRs, this would require knowledge of
3368 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3369 //
3370 // TODO: Unary ops are trivially OK, so accept SGPRs?
3371 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3372 const MachineOperand &Src = MI.getOperand(i);
3373 if (!Src.isReg())
3374 continue;
3375
3376 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3377 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3378 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3379 }
3380
3381 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3382 MI.getNumOperands());
3383 }
3384
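/// Build a mapping that places every register operand of \p MI on the VGPR
/// bank, with no special case for s1 values.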
3385 const RegisterBankInfo::InstructionMapping &
3386 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3387 const MachineFunction &MF = *MI.getParent()->getParent();
3388 const MachineRegisterInfo &MRI = MF.getRegInfo();
3389 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3390
3391 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3392 const MachineOperand &Op = MI.getOperand(I);
3393 if (!Op.isReg())
3394 continue;
3395
3396 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3397 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3398 }
3399
3400 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3401 MI.getNumOperands());
3402 }
3403
3404 const RegisterBankInfo::InstructionMapping &
3405 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3406 const MachineInstr &MI,
3407 int RsrcIdx) const {
3408 // The reported argument index is relative to the IR intrinsic call arguments,
3409 // so we need to shift by the number of defs and the intrinsic ID.
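// For example, with a single def, IR argument N of the intrinsic ends up as
// machine operand N + 2 (one def plus the intrinsic ID operand).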
3410 RsrcIdx += MI.getNumExplicitDefs() + 1;
3411
3412 const int NumOps = MI.getNumOperands();
3413 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3414
3415 // TODO: Should packed/unpacked D16 difference be reported here as part of
3416 // the value mapping?
3417 for (int I = 0; I != NumOps; ++I) {
3418 if (!MI.getOperand(I).isReg())
3419 continue;
3420
3421 Register OpReg = MI.getOperand(I).getReg();
3422 // We replace some dead address operands with $noreg
3423 if (!OpReg)
3424 continue;
3425
3426 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3427
3428 // FIXME: Probably need a new intrinsic register bank searchable table to
3429 // handle arbitrary intrinsics easily.
3430 //
3431 // If this has a sampler, it immediately follows rsrc.
3432 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3433
3434 if (MustBeSGPR) {
3435 // This must be an SGPR, so we must report whatever it is as legal.
3436 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3437 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3438 } else {
3439 // Some operands must be VGPR, and these are easy to copy to.
3440 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3441 }
3442 }
3443
3444 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3445 }
3446
3447 /// Return the mapping for a pointer argument.
3448 const RegisterBankInfo::ValueMapping *
3449 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3450 Register PtrReg) const {
3451 LLT PtrTy = MRI.getType(PtrReg);
3452 unsigned Size = PtrTy.getSizeInBits();
3453 if (Subtarget.useFlatForGlobal() ||
3454 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3455 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3456
3457 // If we're using MUBUF instructions for global memory, an SGPR base register
3458 // is possible. Otherwise this needs to be a VGPR.
3459 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3460 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3461 }
3462
3463 const RegisterBankInfo::InstructionMapping &
3464 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3465
3466 const MachineFunction &MF = *MI.getParent()->getParent();
3467 const MachineRegisterInfo &MRI = MF.getRegInfo();
3468 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3469 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3470 Register PtrReg = MI.getOperand(1).getReg();
3471 LLT PtrTy = MRI.getType(PtrReg);
3472 unsigned AS = PtrTy.getAddressSpace();
3473 unsigned PtrSize = PtrTy.getSizeInBits();
3474
3475 const ValueMapping *ValMapping;
3476 const ValueMapping *PtrMapping;
3477
3478 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3479
3480 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3481 if (isScalarLoadLegal(MI)) {
3482 // We have a uniform instruction so we want to use an SMRD load
3483 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3484 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3485 } else {
3486 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3487
3488 // If we're using MUBUF instructions for global memory, an SGPR base
3489 // register is possible. Otherwise this needs to be a VGPR.
3490 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3491 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3492
3493 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3494 }
3495 } else {
3496 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3497 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3498 }
3499
3500 OpdsMapping[0] = ValMapping;
3501 OpdsMapping[1] = PtrMapping;
3502 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3503 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3504 return Mapping;
3505
3506 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3507 // handle that during instruction selection?
3508 }
3509
3510 unsigned
3511 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3512 const MachineRegisterInfo &MRI,
3513 unsigned Default) const {
3514 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3515 return Bank ? Bank->getID() : Default;
3516 }
3517
3518 const RegisterBankInfo::ValueMapping *
3519 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3520 const MachineRegisterInfo &MRI,
3521 const TargetRegisterInfo &TRI) const {
3522 // Lie and claim anything is legal, even though this needs to be an SGPR;
3523 // applyMapping will have to deal with it as a waterfall loop.
3524 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3525 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3526 return AMDGPU::getValueMapping(Bank, Size);
3527 }
3528
3529 const RegisterBankInfo::ValueMapping *
3530 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3531 const MachineRegisterInfo &MRI,
3532 const TargetRegisterInfo &TRI) const {
3533 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3534 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3535 }
3536
3537 const RegisterBankInfo::ValueMapping *
3538 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3539 const MachineRegisterInfo &MRI,
3540 const TargetRegisterInfo &TRI) const {
3541 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3542 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3543 }
3544
3545 ///
3546 /// This function must return a legal mapping, because
3547 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3548 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3549 /// VGPR-to-SGPR copy to be generated is illegal.
3550 ///
3551 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3552 // legal. These will be dealt with in applyMappingImpl.
3553 //
3554 const RegisterBankInfo::InstructionMapping &
3555 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3556 const MachineFunction &MF = *MI.getParent()->getParent();
3557 const MachineRegisterInfo &MRI = MF.getRegInfo();
3558
3559 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3560 // The default logic bothers to analyze impossible alternative mappings. We
3561 // want the most straightforward mapping, so just directly handle this.
3562 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3563 *TRI);
3564 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3565 *TRI);
3566 assert(SrcBank && "src bank should have been assigned already");
3567 if (!DstBank)
3568 DstBank = SrcBank;
3569
3570 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3571 if (cannotCopy(*DstBank, *SrcBank, Size))
3572 return getInvalidInstructionMapping();
3573
3574 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3575 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3576 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3577 OpdsMapping[0] = &ValMap;
3578 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3579 OpdsMapping[1] = &ValMap;
3580
3581 return getInstructionMapping(
3582 1, /*Cost*/ 1,
3583 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3584 }
3585
3586 if (MI.isRegSequence()) {
3587 // If any input is a VGPR, the result must be a VGPR. The default handling
3588 // assumes any copy between banks is legal.
3589 unsigned BankID = AMDGPU::SGPRRegBankID;
3590
3591 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3592 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3593 // It doesn't make sense to use vcc or scc banks here, so just ignore
3594 // them.
3595 if (OpBank != AMDGPU::SGPRRegBankID) {
3596 BankID = AMDGPU::VGPRRegBankID;
3597 break;
3598 }
3599 }
3600 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3601
3602 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3603 return getInstructionMapping(
3604 1, /*Cost*/ 1,
3605 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3606 }
3607
3608 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3609 // properly.
3610 //
3611 // TODO: There are additional exec masking dependencies to analyze.
3612 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3613 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3614 Register DstReg = MI.getOperand(0).getReg();
3615
3616 // Sometimes the result may have already been assigned a bank.
3617 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3618 ResultBank = DstBank->getID();
3619
3620 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3621 Register Reg = MI.getOperand(I).getReg();
3622 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3623
3624 // FIXME: Assuming VGPR for any undetermined inputs.
3625 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3626 ResultBank = AMDGPU::VGPRRegBankID;
3627 break;
3628 }
3629
3630 // FIXME: Need to promote SGPR case to s32
3631 unsigned OpBank = Bank->getID();
3632 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3633 }
3634
3635 assert(ResultBank != AMDGPU::InvalidRegBankID);
3636
3637 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3638
3639 const ValueMapping &ValMap =
3640 getValueMapping(0, Size, getRegBank(ResultBank));
3641 return getInstructionMapping(
3642 1, /*Cost*/ 1,
3643 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3644 }
3645
3646 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3647 if (Mapping.isValid())
3648 return Mapping;
3649
3650 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3651
3652 switch (MI.getOpcode()) {
3653 default:
3654 return getInvalidInstructionMapping();
3655
3656 case AMDGPU::G_AND:
3657 case AMDGPU::G_OR:
3658 case AMDGPU::G_XOR: {
3659 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
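// For s1 operations, a destination bank that is already assigned is
// respected; otherwise the operand banks decide the result below: any VGPR
// operand forces a VGPR result, any vcc operand forces vcc, and all-SGPR
// booleans stay on the SGPR bank.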
3660 if (Size == 1) {
3661 const RegisterBank *DstBank
3662 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3663
3664 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3665 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3666 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3667 if (DstBank) {
3668 TargetBankID = DstBank->getID();
3669 if (DstBank == &AMDGPU::VCCRegBank) {
3670 TargetBankID = AMDGPU::VCCRegBankID;
3671 BankLHS = AMDGPU::VCCRegBankID;
3672 BankRHS = AMDGPU::VCCRegBankID;
3673 } else {
3674 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3675 AMDGPU::SGPRRegBankID);
3676 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3677 AMDGPU::SGPRRegBankID);
3678 }
3679 } else {
3680 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3681 AMDGPU::VCCRegBankID);
3682 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3683 AMDGPU::VCCRegBankID);
3684
3685 // Both inputs should be true booleans to produce a boolean result.
3686 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3687 TargetBankID = AMDGPU::VGPRRegBankID;
3688 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3689 TargetBankID = AMDGPU::VCCRegBankID;
3690 BankLHS = AMDGPU::VCCRegBankID;
3691 BankRHS = AMDGPU::VCCRegBankID;
3692 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3693 TargetBankID = AMDGPU::SGPRRegBankID;
3694 }
3695 }
3696
3697 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3698 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3699 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3700 break;
3701 }
3702
3703 if (Size == 64) {
3704
3705 if (isSALUMapping(MI)) {
3706 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3707 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3708 } else {
3709 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3710 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3711 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3712
3713 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3714 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3715 }
3716
3717 break;
3718 }
3719
3720 LLVM_FALLTHROUGH;
3721 }
3722 case AMDGPU::G_PTR_ADD:
3723 case AMDGPU::G_PTRMASK:
3724 case AMDGPU::G_ADD:
3725 case AMDGPU::G_SUB:
3726 case AMDGPU::G_MUL:
3727 case AMDGPU::G_SHL:
3728 case AMDGPU::G_LSHR:
3729 case AMDGPU::G_ASHR:
3730 case AMDGPU::G_UADDO:
3731 case AMDGPU::G_USUBO:
3732 case AMDGPU::G_UADDE:
3733 case AMDGPU::G_SADDE:
3734 case AMDGPU::G_USUBE:
3735 case AMDGPU::G_SSUBE:
3736 case AMDGPU::G_SMIN:
3737 case AMDGPU::G_SMAX:
3738 case AMDGPU::G_UMIN:
3739 case AMDGPU::G_UMAX:
3740 case AMDGPU::G_ABS:
3741 case AMDGPU::G_SHUFFLE_VECTOR:
3742 case AMDGPU::G_SBFX:
3743 case AMDGPU::G_UBFX:
3744 if (isSALUMapping(MI))
3745 return getDefaultMappingSOP(MI);
3746 LLVM_FALLTHROUGH;
3747
3748 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3749 case AMDGPU::G_SSUBSAT:
3750 case AMDGPU::G_UADDSAT:
3751 case AMDGPU::G_USUBSAT:
3752 case AMDGPU::G_FADD:
3753 case AMDGPU::G_FSUB:
3754 case AMDGPU::G_FPTOSI:
3755 case AMDGPU::G_FPTOUI:
3756 case AMDGPU::G_FMUL:
3757 case AMDGPU::G_FMA:
3758 case AMDGPU::G_FMAD:
3759 case AMDGPU::G_FSQRT:
3760 case AMDGPU::G_FFLOOR:
3761 case AMDGPU::G_FCEIL:
3762 case AMDGPU::G_FRINT:
3763 case AMDGPU::G_SITOFP:
3764 case AMDGPU::G_UITOFP:
3765 case AMDGPU::G_FPTRUNC:
3766 case AMDGPU::G_FPEXT:
3767 case AMDGPU::G_FEXP2:
3768 case AMDGPU::G_FLOG2:
3769 case AMDGPU::G_FMINNUM:
3770 case AMDGPU::G_FMAXNUM:
3771 case AMDGPU::G_FMINNUM_IEEE:
3772 case AMDGPU::G_FMAXNUM_IEEE:
3773 case AMDGPU::G_FCANONICALIZE:
3774 case AMDGPU::G_INTRINSIC_TRUNC:
3775 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3776 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3777 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3778 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3779 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3780 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3781 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3782 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3783 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3784 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3785 case AMDGPU::G_AMDGPU_SMED3:
3786 return getDefaultMappingVOP(MI);
3787 case AMDGPU::G_UMULH:
3788 case AMDGPU::G_SMULH: {
3789 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3790 return getDefaultMappingSOP(MI);
3791 return getDefaultMappingVOP(MI);
3792 }
3793 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3794 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3795 // Three possible mappings:
3796 //
3797 // - Default SOP
3798 // - Default VOP
3799 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3800 //
3801 // This allows instruction selection to keep the multiplication part of the
3802 // instruction on the SALU.
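// The operand layout is (dst:s64, carry-out:s1, src0:s32, src1:s32,
// src2:s64), so operands 2 and 3 below are the multiply inputs and operand
// 4 is the addend.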
3803 bool AllSalu = true;
3804 bool MulSalu = true;
3805 for (unsigned i = 0; i < 5; ++i) {
3806 Register Reg = MI.getOperand(i).getReg();
3807 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3808 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3809 AllSalu = false;
3810 if (i == 2 || i == 3) {
3811 MulSalu = false;
3812 break;
3813 }
3814 }
3815 }
3816 }
3817
3818 if (AllSalu)
3819 return getDefaultMappingSOP(MI);
3820
3821 // If the multiply-add is full-rate in VALU, use that even if the
3822 // multiplication part is scalar. Accumulating separately on the VALU would
3823 // take two instructions.
3824 if (!MulSalu || Subtarget.hasFullRate64Ops())
3825 return getDefaultMappingVOP(MI);
3826
3827 // Keep the multiplication on the SALU, then accumulate on the VALU.
3828 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3829 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3830 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3831 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3832 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3833 break;
3834 }
3835 case AMDGPU::G_IMPLICIT_DEF: {
3836 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3837 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3838 break;
3839 }
3840 case AMDGPU::G_FCONSTANT:
3841 case AMDGPU::G_CONSTANT:
3842 case AMDGPU::G_GLOBAL_VALUE:
3843 case AMDGPU::G_BLOCK_ADDR:
3844 case AMDGPU::G_READCYCLECOUNTER: {
3845 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3846 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3847 break;
3848 }
3849 case AMDGPU::G_FRAME_INDEX: {
3850 // TODO: This should be the same as other constants, but eliminateFrameIndex
3851 // currently assumes VALU uses.
3852 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3853 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3854 break;
3855 }
3856 case AMDGPU::G_DYN_STACKALLOC: {
3857 // Result is always uniform, and a wave reduction is needed for the source.
3858 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3859 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3860 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3861 break;
3862 }
3863 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3864 // This case is weird because we expect a physical register in the source,
3865 // but need to set a bank anyway.
3866 //
3867 // We could select the result to SGPR or VGPR, but for the one current use
3868 // it's more practical to always use VGPR.
3869 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3870 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3871 break;
3872 }
3873 case AMDGPU::G_INSERT: {
3874 unsigned BankID = getMappingType(MRI, MI);
3875 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3876 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3877 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3878 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3879 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3880 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3881 OpdsMapping[3] = nullptr;
3882 break;
3883 }
3884 case AMDGPU::G_EXTRACT: {
3885 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3886 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3887 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3888 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3889 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3890 OpdsMapping[2] = nullptr;
3891 break;
3892 }
3893 case AMDGPU::G_BUILD_VECTOR:
3894 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3895 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3896 if (DstTy == LLT::fixed_vector(2, 16)) {
3897 unsigned DstSize = DstTy.getSizeInBits();
3898 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3899 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3900 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3901 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3902
3903 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3904 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3905 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3906 break;
3907 }
3908
3909 LLVM_FALLTHROUGH;
3910 }
3911 case AMDGPU::G_MERGE_VALUES:
3912 case AMDGPU::G_CONCAT_VECTORS: {
3913 unsigned Bank = getMappingType(MRI, MI);
3914 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3915 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3916
3917 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3918 // Op1 and Dst should use the same register bank.
3919 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3920 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3921 break;
3922 }
3923 case AMDGPU::G_BITREVERSE:
3924 case AMDGPU::G_BITCAST:
3925 case AMDGPU::G_INTTOPTR:
3926 case AMDGPU::G_PTRTOINT:
3927 case AMDGPU::G_FABS:
3928 case AMDGPU::G_FNEG: {
3929 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3930 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3931 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3932 break;
3933 }
3934 case AMDGPU::G_AMDGPU_FFBH_U32:
3935 case AMDGPU::G_AMDGPU_FFBL_B32:
3936 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3937 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3938 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3939 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3940 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3941 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3942 break;
3943 }
3944 case AMDGPU::G_CTPOP: {
3945 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3946 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3947 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3948
3949 // This should really be getValueMappingSGPR64Only, but allowing the generic
3950 // code to handle the register split just makes using LegalizerHelper more
3951 // difficult.
3952 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3953 break;
3954 }
3955 case AMDGPU::G_TRUNC: {
3956 Register Dst = MI.getOperand(0).getReg();
3957 Register Src = MI.getOperand(1).getReg();
3958 unsigned Bank = getRegBankID(Src, MRI);
3959 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3960 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3961 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3962 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3963 break;
3964 }
3965 case AMDGPU::G_ZEXT:
3966 case AMDGPU::G_SEXT:
3967 case AMDGPU::G_ANYEXT:
3968 case AMDGPU::G_SEXT_INREG: {
3969 Register Dst = MI.getOperand(0).getReg();
3970 Register Src = MI.getOperand(1).getReg();
3971 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3972 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3973
3974 unsigned DstBank;
3975 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3976 assert(SrcBank);
3977 switch (SrcBank->getID()) {
3978 case AMDGPU::SGPRRegBankID:
3979 DstBank = AMDGPU::SGPRRegBankID;
3980 break;
3981 default:
3982 DstBank = AMDGPU::VGPRRegBankID;
3983 break;
3984 }
3985
3986 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3987 // 32-bits, and then to 64.
3988 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3989 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3990 SrcSize);
3991 break;
3992 }
3993 case AMDGPU::G_FCMP: {
3994 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3995 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3996 OpdsMapping[1] = nullptr; // Predicate Operand.
3997 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3998 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3999 break;
4000 }
4001 case AMDGPU::G_STORE: {
4002 assert(MI.getOperand(0).isReg());
4003 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4004
4005 // FIXME: We need to specify a different reg bank once scalar stores are
4006 // supported.
4007 const ValueMapping *ValMapping =
4008 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4009 OpdsMapping[0] = ValMapping;
4010 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4011 break;
4012 }
4013 case AMDGPU::G_ICMP: {
4014 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4015 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4016
4017 // See if the result register has already been constrained to vcc, which may
4018 // happen due to control flow intrinsic lowering.
4019 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4020 AMDGPU::SGPRRegBankID);
4021 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4022 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4023
4024 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4025 Op2Bank == AMDGPU::SGPRRegBankID &&
4026 Op3Bank == AMDGPU::SGPRRegBankID &&
4027 (Size == 32 || (Size == 64 &&
4028 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4029 Subtarget.hasScalarCompareEq64()));
4030
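// The compare can stay on the SALU (producing SCC) only if the result and
// both inputs are SGPRs and the type is handled by a scalar compare;
// otherwise the result is a vcc lane mask and the inputs must be VGPRs.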
4031 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4032 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4033
4034 // TODO: Use 32-bit for scalar output size.
4035 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4036 const unsigned ResultSize = 1;
4037
4038 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4039 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4040 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4041 break;
4042 }
4043 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4044 // VGPR index can be used for waterfall when indexing a SGPR vector.
4045 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4046 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4047 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4048 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4049 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4050 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4051
4052 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4053 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4054
4055 // The index can be in either bank if the source vector is a VGPR.
4056 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4057 break;
4058 }
4059 case AMDGPU::G_INSERT_VECTOR_ELT: {
4060 unsigned OutputBankID = isSALUMapping(MI) ?
4061 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4062
4063 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4064 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4065 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4066 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4067 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4068
4069 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4070 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4071
4072 // This is a weird case, because we need to break down the mapping based on
4073 // the register bank of a different operand.
4074 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4075 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4076 InsertSize);
4077 } else {
4078 assert(InsertSize == 32 || InsertSize == 64);
4079 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4080 }
4081
4082 // The index can be in either bank if the source vector is a VGPR.
4083 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4084 break;
4085 }
4086 case AMDGPU::G_UNMERGE_VALUES: {
4087 unsigned Bank = getMappingType(MRI, MI);
4088
4089 // Op1 and Dst should use the same register bank.
4090 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4091 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4092 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4093 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4094 }
4095 break;
4096 }
4097 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4098 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4099 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4100 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4101 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4102 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4103 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4104 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4105 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4106 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4107 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4108 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4109 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4110 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4111 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4112 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4113 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4114
4115 // rsrc
4116 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4117
4118 // vindex
4119 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4120
4121 // voffset
4122 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4123
4124 // soffset
4125 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4126
4127 // Any remaining operands are immediates and were correctly null
4128 // initialized.
4129 break;
4130 }
4131 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4132 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4133 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4134 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4135 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4136 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4137 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4138 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4139 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4140 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4141 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4142 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4143 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4144 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4145 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4146 // vdata_out
4147 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4148
4149 // vdata_in
4150 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4151
4152 // rsrc
4153 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4154
4155 // vindex
4156 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4157
4158 // voffset
4159 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4160
4161 // soffset
4162 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4163
4164 // Any remaining operands are immediates and were correctly null
4165 // initialized.
4166 break;
4167 }
4168 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4169 // vdata_out
4170 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4171
4172 // vdata_in
4173 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4174
4175 // cmp
4176 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4177
4178 // rsrc
4179 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4180
4181 // vindex
4182 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4183
4184 // voffset
4185 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4186
4187 // soffset
4188 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4189
4190 // Any remaining operands are immediates and were correctly null
4191 // initialized.
4192 break;
4193 }
4194 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4195 // Lie and claim everything is legal, even though some need to be
4196 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4197 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4198 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4199
4200 // We need to convert this to a MUBUF if either the resource or the offset
4201 // is a VGPR.
4202 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4203 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4204 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4205
4206 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4207 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4208 break;
4209 }
4210 case AMDGPU::G_INTRINSIC: {
4211 switch (MI.getIntrinsicID()) {
4212 default:
4213 return getInvalidInstructionMapping();
4214 case Intrinsic::amdgcn_div_fmas:
4215 case Intrinsic::amdgcn_div_fixup:
4216 case Intrinsic::amdgcn_trig_preop:
4217 case Intrinsic::amdgcn_sin:
4218 case Intrinsic::amdgcn_cos:
4219 case Intrinsic::amdgcn_log_clamp:
4220 case Intrinsic::amdgcn_rcp:
4221 case Intrinsic::amdgcn_rcp_legacy:
4222 case Intrinsic::amdgcn_sqrt:
4223 case Intrinsic::amdgcn_rsq:
4224 case Intrinsic::amdgcn_rsq_legacy:
4225 case Intrinsic::amdgcn_rsq_clamp:
4226 case Intrinsic::amdgcn_fmul_legacy:
4227 case Intrinsic::amdgcn_fma_legacy:
4228 case Intrinsic::amdgcn_ldexp:
4229 case Intrinsic::amdgcn_frexp_mant:
4230 case Intrinsic::amdgcn_frexp_exp:
4231 case Intrinsic::amdgcn_fract:
4232 case Intrinsic::amdgcn_cvt_pkrtz:
4233 case Intrinsic::amdgcn_cvt_pknorm_i16:
4234 case Intrinsic::amdgcn_cvt_pknorm_u16:
4235 case Intrinsic::amdgcn_cvt_pk_i16:
4236 case Intrinsic::amdgcn_cvt_pk_u16:
4237 case Intrinsic::amdgcn_fmed3:
4238 case Intrinsic::amdgcn_cubeid:
4239 case Intrinsic::amdgcn_cubema:
4240 case Intrinsic::amdgcn_cubesc:
4241 case Intrinsic::amdgcn_cubetc:
4242 case Intrinsic::amdgcn_sffbh:
4243 case Intrinsic::amdgcn_fmad_ftz:
4244 case Intrinsic::amdgcn_mbcnt_lo:
4245 case Intrinsic::amdgcn_mbcnt_hi:
4246 case Intrinsic::amdgcn_mul_u24:
4247 case Intrinsic::amdgcn_mul_i24:
4248 case Intrinsic::amdgcn_mulhi_u24:
4249 case Intrinsic::amdgcn_mulhi_i24:
4250 case Intrinsic::amdgcn_lerp:
4251 case Intrinsic::amdgcn_sad_u8:
4252 case Intrinsic::amdgcn_msad_u8:
4253 case Intrinsic::amdgcn_sad_hi_u8:
4254 case Intrinsic::amdgcn_sad_u16:
4255 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4256 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4257 case Intrinsic::amdgcn_mqsad_u32_u8:
4258 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4259 case Intrinsic::amdgcn_alignbyte:
4260 case Intrinsic::amdgcn_perm:
4261 case Intrinsic::amdgcn_fdot2:
4262 case Intrinsic::amdgcn_sdot2:
4263 case Intrinsic::amdgcn_udot2:
4264 case Intrinsic::amdgcn_sdot4:
4265 case Intrinsic::amdgcn_udot4:
4266 case Intrinsic::amdgcn_sdot8:
4267 case Intrinsic::amdgcn_udot8:
4268 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4269 case Intrinsic::amdgcn_fdot2_f16_f16:
4270 case Intrinsic::amdgcn_fdot2_f32_bf16:
4271 case Intrinsic::amdgcn_sudot4:
4272 case Intrinsic::amdgcn_sudot8:
4273 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4274 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4275 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4276 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4277 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4278 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4279 return getDefaultMappingVOP(MI);
4280 case Intrinsic::amdgcn_sbfe:
4281 case Intrinsic::amdgcn_ubfe:
4282 if (isSALUMapping(MI))
4283 return getDefaultMappingSOP(MI);
4284 return getDefaultMappingVOP(MI);
4285 case Intrinsic::amdgcn_ds_swizzle:
4286 case Intrinsic::amdgcn_ds_permute:
4287 case Intrinsic::amdgcn_ds_bpermute:
4288 case Intrinsic::amdgcn_update_dpp:
4289 case Intrinsic::amdgcn_mov_dpp8:
4290 case Intrinsic::amdgcn_mov_dpp:
4291 case Intrinsic::amdgcn_strict_wwm:
4292 case Intrinsic::amdgcn_wwm:
4293 case Intrinsic::amdgcn_strict_wqm:
4294 case Intrinsic::amdgcn_wqm:
4295 case Intrinsic::amdgcn_softwqm:
4296 case Intrinsic::amdgcn_set_inactive:
4297 case Intrinsic::amdgcn_permlane64:
4298 return getDefaultMappingAllVGPR(MI);
4299 case Intrinsic::amdgcn_kernarg_segment_ptr:
4300 case Intrinsic::amdgcn_s_getpc:
4301 case Intrinsic::amdgcn_groupstaticsize:
4302 case Intrinsic::amdgcn_reloc_constant:
4303 case Intrinsic::returnaddress: {
4304 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4305 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4306 break;
4307 }
4308 case Intrinsic::amdgcn_wqm_vote: {
4309 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4310 OpdsMapping[0] = OpdsMapping[2]
4311 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4312 break;
4313 }
4314 case Intrinsic::amdgcn_ps_live: {
4315 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4316 break;
4317 }
4318 case Intrinsic::amdgcn_div_scale: {
4319 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4320 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4321 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4322 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4323
4324 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4325 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4326 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4327 break;
4328 }
4329 case Intrinsic::amdgcn_class: {
4330 Register Src0Reg = MI.getOperand(2).getReg();
4331 Register Src1Reg = MI.getOperand(3).getReg();
4332 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4333 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4334 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4335 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4336 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4337 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4338 break;
4339 }
4340 case Intrinsic::amdgcn_icmp:
4341 case Intrinsic::amdgcn_fcmp: {
4342 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4343 // This is not VCCRegBank because this is not used in boolean contexts.
4344 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4345 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4346 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4347 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4348 break;
4349 }
4350 case Intrinsic::amdgcn_readlane: {
4351 // This must be an SGPR, but accept a VGPR.
4352 Register IdxReg = MI.getOperand(3).getReg();
4353 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4354 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4355 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4356 LLVM_FALLTHROUGH;
4357 }
4358 case Intrinsic::amdgcn_readfirstlane: {
4359 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4360 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4361 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4362 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4363 break;
4364 }
4365 case Intrinsic::amdgcn_writelane: {
4366 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4367 Register SrcReg = MI.getOperand(2).getReg();
4368 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4369 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4370 Register IdxReg = MI.getOperand(3).getReg();
4371 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4372 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4373 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4374
4375 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4376 // to legalize.
4377 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4378 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4379 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4380 break;
4381 }
4382 case Intrinsic::amdgcn_if_break: {
4383 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4384 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4385 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4386 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4387 break;
4388 }
4389 case Intrinsic::amdgcn_permlane16:
4390 case Intrinsic::amdgcn_permlanex16: {
4391 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4392 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4393 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4394 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4395 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4396 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
    case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
    case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
    case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
    case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
    case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
    case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
    case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
    case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
      // Default for MAI intrinsics.
      // srcC can also be an immediate which can be folded later.
      // FIXME: Should we eventually add an alternative mapping with AGPR src
      // for srcA/srcB?
      //
      // vdst, srcA, srcB, srcC
      const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
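      // Map vdst and srcC to the AGPR bank only when the function may need
      // AGPRs; otherwise keep them in VGPRs to avoid cross-bank copies.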
      OpdsMapping[0] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] =
          Info->mayNeedAGPRs()
              ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
              : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
      // vdst, srcA, srcB, srcC, idx
      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_lds_param_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_interp_inreg_p10:
    case Intrinsic::amdgcn_interp_inreg_p2:
    case Intrinsic::amdgcn_interp_inreg_p10_f16:
    case Intrinsic::amdgcn_interp_inreg_p2_f16: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ballot: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
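      // The ballot result is a wave-wide lane mask read as an ordinary scalar
      // value, so it uses the SGPR bank; the source is a VCC-bank boolean.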
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    auto IntrID = MI.getIntrinsicID();
    const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
    assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
    // Non-images can have complications from operands that allow both SGPR
    // and VGPR. For now it's too complicated to figure out the final opcode
    // to derive the register bank from the MCInstrDesc.
    assert(RSrcIntrin->IsImage);
    return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
  }
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
    unsigned N = MI.getNumExplicitOperands() - 2;
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
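    // The descriptor (last explicit use operand) must end up in SGPRs;
    // request the SGPR bank for it here.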
    OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
    if (N == 3) {
      // Sequential form: all operands combined into VGPR256/VGPR512
      unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
      if (Size > 256)
        Size = 512;
      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
    } else {
      // NSA form
      for (unsigned I = 2; I < N; ++I) {
        unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
      }
    }
    break;
  }
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
    auto IntrID = MI.getIntrinsicID();
    switch (IntrID) {
    case Intrinsic::amdgcn_s_getreg:
    case Intrinsic::amdgcn_s_memtime:
    case Intrinsic::amdgcn_s_memrealtime:
    case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
    case Intrinsic::amdgcn_s_sendmsg_rtn: {
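      // These all produce uniform results from scalar instructions, so the
      // result belongs in the SGPR bank.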
      unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fadd:
    case Intrinsic::amdgcn_global_atomic_csub:
    case Intrinsic::amdgcn_global_atomic_fmin:
    case Intrinsic::amdgcn_global_atomic_fmax:
    case Intrinsic::amdgcn_flat_atomic_fadd:
    case Intrinsic::amdgcn_flat_atomic_fmin:
    case Intrinsic::amdgcn_flat_atomic_fmax:
    case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
    case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
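      // FLAT and global atomics take VGPR addresses and data, so map every
      // operand to the VGPR bank.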
      return getDefaultMappingAllVGPR(MI);
    case Intrinsic::amdgcn_ds_ordered_add:
    case Intrinsic::amdgcn_ds_ordered_swap: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
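      // The M0 value must ultimately be an SGPR, but accept whatever bank it
      // currently has and fix it up later.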
      unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                     AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_append:
    case Intrinsic::amdgcn_ds_consume: {
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_exp_compr:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp:
      // FIXME: Could we support packed types here?
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      break;
    case Intrinsic::amdgcn_exp_row:
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
      OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
      break;
    case Intrinsic::amdgcn_s_sendmsg:
    case Intrinsic::amdgcn_s_sendmsghalt: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_s_setreg: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_end_cf: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_else: {
      unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
      break;
    }
    case Intrinsic::amdgcn_live_mask: {
      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_wqm_demote:
    case Intrinsic::amdgcn_kill: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load:
    case Intrinsic::amdgcn_raw_tbuffer_load: {
      // FIXME: Should make intrinsic ID the last operand of the instruction,
      // then this would be the same as store
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_raw_buffer_store:
    case Intrinsic::amdgcn_raw_buffer_store_format:
    case Intrinsic::amdgcn_raw_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load:
    case Intrinsic::amdgcn_struct_tbuffer_load: {
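      // vdata, vindex and voffset are VGPRs; rsrc and soffset must end up in
      // SGPRs and are handled with a waterfall loop if they are not.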
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_load_lds: {
      OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_struct_buffer_store:
    case Intrinsic::amdgcn_struct_tbuffer_store: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
      OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
      OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_init_exec_from_input: {
      unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_init:
    case Intrinsic::amdgcn_ds_gws_barrier:
    case Intrinsic::amdgcn_ds_gws_sema_br: {
      OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_gws_sema_v:
    case Intrinsic::amdgcn_ds_gws_sema_p:
    case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // This must be an SGPR, but accept a VGPR.
      unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                   AMDGPU::SGPRRegBankID);
      OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_global_load_lds: {
      OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
      OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    }
    case Intrinsic::amdgcn_lds_direct_load: {
      const int M0Idx = MI.getNumOperands() - 1;
      Register M0Reg = MI.getOperand(M0Idx).getReg();
      unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();

      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
      for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
        OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

      // Must be SGPR, but we must take whatever the original bank is and fix it
      // later.
      OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
      break;
    }
    case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
    case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
      OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
      break;
    default:
      return getInvalidInstructionMapping();
    }
    break;
  }
  case AMDGPU::G_SELECT: {
    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
    unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
                                    AMDGPU::SGPRRegBankID);
    bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
                    Op3Bank == AMDGPU::SGPRRegBankID;

    unsigned CondBankDefault = SGPRSrcs ?
      AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
                                     CondBankDefault);
    if (CondBank == AMDGPU::SGPRRegBankID)
      CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
    else if (CondBank == AMDGPU::VGPRRegBankID)
      CondBank = AMDGPU::VCCRegBankID;

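    // Only an all-SGPR select with a scalar (SGPR) condition can stay on the
    // SALU; any divergence forces a VGPR result with a VCC condition.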
    unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
      AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;

    assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);

    // TODO: Should report 32-bit for scalar condition type.
    if (Size == 64) {
      OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
    } else {
      OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
      OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
    }

    break;
  }

  case AMDGPU::G_SI_CALL: {
    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
    // Lie and claim everything is legal, even though some need to be
    // SGPRs. applyMapping will have to deal with it as a waterfall loop.
    OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);

    // Allow anything for implicit arguments
    for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
      if (MI.getOperand(I).isReg()) {
        Register Reg = MI.getOperand(I).getReg();
        auto OpBank = getRegBankID(Reg, MRI);
        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
      }
    }
    break;
  }
  case AMDGPU::G_LOAD:
  case AMDGPU::G_ZEXTLOAD:
  case AMDGPU::G_SEXTLOAD:
    return getInstrMappingForLoad(MI);

  case AMDGPU::G_ATOMICRMW_XCHG:
  case AMDGPU::G_ATOMICRMW_ADD:
  case AMDGPU::G_ATOMICRMW_SUB:
  case AMDGPU::G_ATOMICRMW_AND:
  case AMDGPU::G_ATOMICRMW_OR:
  case AMDGPU::G_ATOMICRMW_XOR:
  case AMDGPU::G_ATOMICRMW_MAX:
  case AMDGPU::G_ATOMICRMW_MIN:
  case AMDGPU::G_ATOMICRMW_UMAX:
  case AMDGPU::G_ATOMICRMW_UMIN:
  case AMDGPU::G_ATOMICRMW_FADD:
  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
  case AMDGPU::G_AMDGPU_ATOMIC_INC:
  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
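    // The value operands must be VGPRs; the pointer mapping is derived from
    // the pointer's register bank.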
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_ATOMIC_CMPXCHG: {
    OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
    OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
    OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
    OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
    break;
  }
  case AMDGPU::G_BRCOND: {
    unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
                                 AMDGPU::SGPRRegBankID);
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
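    // Any condition that is not in the SGPR bank is treated as divergent and
    // mapped to the VCC bank.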
    if (Bank != AMDGPU::SGPRRegBankID)
      Bank = AMDGPU::VCCRegBankID;

    OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
    break;
  }
  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
    return getDefaultMappingVOP(MI);
  }

  return getInstructionMapping(/*ID*/1, /*Cost*/1,
                               getOperandsMapping(OpdsMapping),
                               MI.getNumOperands());
}
