1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
/// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
48 /// \par Constant bus restriction
49 ///
/// VALU instructions have a limitation known as the constant bus
/// restriction. Most VALU instructions can use SGPR operands, but may read at
/// most 1 SGPR or constant literal value (this is raised to 2 in gfx10 for
/// most instructions). This is one unique SGPR, so the same SGPR may be used
/// for multiple operands. From a register bank perspective, any combination
/// of operands should be legal as an SGPR, but this is contextually dependent
/// on the SGPR operands all being the same register. It is therefore optimal
/// to choose the SGPR with the most uses to minimize the number of copies.
58 ///
/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
/// operation should have its source operands all mapped to VGPRs (except for
/// VCC), inserting copies from any SGPR operands. This is the most trivial
/// legal mapping. Anything beyond the simplest 1:1 instruction selection would
/// be too complicated to solve here. Every optimization pattern or instruction
/// selected to multiple outputs would have to enforce this rule, and there
/// would be additional complexity in tracking this rule for every G_*
/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
/// picking the optimal operand combination from a post-isel optimization pass.
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100   const AMDGPURegisterBankInfo &RBI;
101   MachineRegisterInfo &MRI;
102   const RegisterBank *NewBank;
103   SmallVector<MachineInstr *, 4> NewInsts;
104 
105 public:
106   ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107                       MachineRegisterInfo &MRI_, const RegisterBank *RB)
108     : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 
110   ~ApplyRegBankMapping() {
111     for (MachineInstr *MI : NewInsts)
112       applyBank(*MI);
113   }
114 
115   /// Set any registers that don't have a set register class or bank to SALU.
116   void applyBank(MachineInstr &MI) {
117     const unsigned Opc = MI.getOpcode();
118     if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119         Opc == AMDGPU::G_SEXT) {
120       // LegalizerHelper wants to use the basic legalization artifacts when
121       // widening etc. We don't handle selection with vcc in artifact sources,
122       // so we need to use a select instead to handle these properly.
123       Register DstReg = MI.getOperand(0).getReg();
124       Register SrcReg = MI.getOperand(1).getReg();
125       const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126       if (SrcBank == &AMDGPU::VCCRegBank) {
127         const LLT S32 = LLT::scalar(32);
128         assert(MRI.getType(SrcReg) == LLT::scalar(1));
129         assert(MRI.getType(DstReg) == S32);
130         assert(NewBank == &AMDGPU::VGPRRegBank);
131 
132         // Replace the extension with a select, which really uses the boolean
133         // source.
134         MachineIRBuilder B(MI);
135         auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136         auto False = B.buildConstant(S32, 0);
137         B.buildSelect(DstReg, SrcReg, True, False);
138         MRI.setRegBank(True.getReg(0), *NewBank);
139         MRI.setRegBank(False.getReg(0), *NewBank);
140         MI.eraseFromParent();
141       }
142 
143       assert(!MRI.getRegClassOrRegBank(DstReg));
144       MRI.setRegBank(DstReg, *NewBank);
145       return;
146     }
147 
148 #ifndef NDEBUG
149     if (Opc == AMDGPU::G_TRUNC) {
150       Register DstReg = MI.getOperand(0).getReg();
151       const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152       assert(DstBank != &AMDGPU::VCCRegBank);
153     }
154 #endif
155 
156     for (MachineOperand &Op : MI.operands()) {
157       if (!Op.isReg())
158         continue;
159 
160       // We may see physical registers if building a real MI
161       Register Reg = Op.getReg();
162       if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163         continue;
164 
165       const RegisterBank *RB = NewBank;
166       if (MRI.getType(Reg) == LLT::scalar(1)) {
167         assert(NewBank == &AMDGPU::VGPRRegBank &&
168                "s1 operands should only be used for vector bools");
169         assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170                 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171                "not expecting legalization artifacts here");
172         RB = &AMDGPU::VCCRegBank;
173       }
174 
175       MRI.setRegBank(Reg, *RB);
176     }
177   }
178 
179   void erasingInstr(MachineInstr &MI) override {}
180 
181   void createdInstr(MachineInstr &MI) override {
182     // At this point, the instruction was just inserted and has no operands.
183     NewInsts.push_back(&MI);
184   }
185 
186   void changingInstr(MachineInstr &MI) override {}
187   void changedInstr(MachineInstr &MI) override {
188     // FIXME: In principle we should probably add the instruction to NewInsts,
189     // but the way the LegalizerHelper uses the observer, we will always see the
190     // registers we need to set the regbank on also referenced in a new
191     // instruction.
192   }
193 };
194 
195 }
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197     : AMDGPUGenRegisterBankInfo(),
198       Subtarget(ST),
199       TRI(Subtarget.getRegisterInfo()),
200       TII(Subtarget.getInstrInfo()) {
201 
202   // HACK: Until this is fully tablegen'd.
203   static llvm::once_flag InitializeRegisterBankFlag;
204 
205   static auto InitializeRegisterBankOnce = [this]() {
206     assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
207            &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
208            &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
209     (void)this;
210   };
211 
212   llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
213 }
214 
215 static bool isVectorRegisterBank(const RegisterBank &Bank) {
216   unsigned BankID = Bank.getID();
217   return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
218 }
219 
220 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
221                                           const RegisterBank &Src,
222                                           unsigned Size) const {
223   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
224   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
225       (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
226     return std::numeric_limits<unsigned>::max();
227   }
228 
229   // Bool values are tricky, because the meaning is based on context. The SCC
230   // and VCC banks are for the natural scalar and vector conditions produced by
231   // a compare.
232   //
233   // Legalization doesn't know about the necessary context, so an s1 use may
234   // have been a truncate from an arbitrary value, in which case a copy (lowered
235   // as a compare with 0) needs to be inserted.
236   if (Size == 1 &&
237       (Dst.getID() == AMDGPU::SGPRRegBankID) &&
238       (isVectorRegisterBank(Src) ||
239        Src.getID() == AMDGPU::SGPRRegBankID ||
240        Src.getID() == AMDGPU::VCCRegBankID))
241     return std::numeric_limits<unsigned>::max();
242 
243   // There is no direct copy between AGPRs.
244   if (Dst.getID() == AMDGPU::AGPRRegBankID &&
245       Src.getID() == AMDGPU::AGPRRegBankID)
246     return 4;
247 
248   return RegisterBankInfo::copyCost(Dst, Src, Size);
249 }
250 
/// Return the cost of breaking a value down according to \p ValMapping.
///
/// A mapping with two or more parts, or whose first part is at least 64 bits
/// long, is reported as expensive (10); anything else costs 1. \p CurBank is
/// not consulted.
unsigned AMDGPURegisterBankInfo::getBreakDownCost(
  const ValueMapping &ValMapping,
  const RegisterBank *CurBank) const {
  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
  // VGPR.
  // FIXME: Is there a better way to do this?
  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
    return 10; // This is expensive.

  // NOTE(review): the early return above already covers NumBreakDowns >= 2, so
  // the NumBreakDowns == 2 requirement below cannot hold once this point is
  // reached; in an asserts build any mapping that gets here trips the assert.
  // Confirm which of the two conditions is the intended one.
  assert(ValMapping.NumBreakDowns == 2 &&
         ValMapping.BreakDown[0].Length == 32 &&
         ValMapping.BreakDown[0].StartIdx == 0 &&
         ValMapping.BreakDown[1].Length == 32 &&
         ValMapping.BreakDown[1].StartIdx == 32 &&
         ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);

  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
  // want.

  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
  // alignment restrictions, but this probably isn't important.
  return 1;
}
275 
276 const RegisterBank &
277 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
278                                                LLT Ty) const {
279   if (&RC == &AMDGPU::SReg_1RegClass)
280     return AMDGPU::VCCRegBank;
281 
282   // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
283   // VCC-like use.
284   if (TRI->isSGPRClass(&RC)) {
285     // FIXME: This probably came from a copy from a physical register, which
286     // should be inferable from the copied to-type. We don't have many boolean
287     // physical register constraints so just assume a normal SGPR for now.
288     if (!Ty.isValid())
289       return AMDGPU::SGPRRegBank;
290 
291     return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
292   }
293 
294   return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
295 }
296 
297 template <unsigned NumOps>
298 RegisterBankInfo::InstructionMappings
299 AMDGPURegisterBankInfo::addMappingFromTable(
300     const MachineInstr &MI, const MachineRegisterInfo &MRI,
301     const std::array<unsigned, NumOps> RegSrcOpIdx,
302     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
303 
304   InstructionMappings AltMappings;
305 
306   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
307 
308   unsigned Sizes[NumOps];
309   for (unsigned I = 0; I < NumOps; ++I) {
310     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
311     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
312   }
313 
314   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
315     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
316     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
317   }
318 
319   // getInstrMapping's default mapping uses ID 1, so start at 2.
320   unsigned MappingID = 2;
321   for (const auto &Entry : Table) {
322     for (unsigned I = 0; I < NumOps; ++I) {
323       int OpIdx = RegSrcOpIdx[I];
324       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
325     }
326 
327     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
328                                                  getOperandsMapping(Operands),
329                                                  Operands.size()));
330   }
331 
332   return AltMappings;
333 }
334 
335 RegisterBankInfo::InstructionMappings
336 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
337     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
338   switch (MI.getIntrinsicID()) {
339   case Intrinsic::amdgcn_readlane: {
340     static const OpRegBankEntry<3> Table[2] = {
341       // Perfectly legal.
342       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
343 
344       // Need a readfirstlane for the index.
345       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
346     };
347 
348     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
349     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
350   }
351   case Intrinsic::amdgcn_writelane: {
352     static const OpRegBankEntry<4> Table[4] = {
353       // Perfectly legal.
354       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
355 
356       // Need readfirstlane of first op
357       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
358 
359       // Need readfirstlane of second op
360       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
361 
362       // Need readfirstlane of both ops
363       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
364     };
365 
366     // rsrc, voffset, offset
367     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
368     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
369   }
370   default:
371     return RegisterBankInfo::getInstrAlternativeMappings(MI);
372   }
373 }
374 
375 RegisterBankInfo::InstructionMappings
376 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
377     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
378 
379   switch (MI.getIntrinsicID()) {
380   case Intrinsic::amdgcn_s_buffer_load: {
381     static const OpRegBankEntry<2> Table[4] = {
382       // Perfectly legal.
383       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
384 
385       // Only need 1 register in loop
386       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
387 
388       // Have to waterfall the resource.
389       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
390 
391       // Have to waterfall the resource, and the offset.
392       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
393     };
394 
395     // rsrc, offset
396     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
397     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
398   }
399   case Intrinsic::amdgcn_ds_ordered_add:
400   case Intrinsic::amdgcn_ds_ordered_swap: {
401     // VGPR = M0, VGPR
402     static const OpRegBankEntry<3> Table[2] = {
403       // Perfectly legal.
404       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
405 
406       // Need a readfirstlane for m0
407       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
408     };
409 
410     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
411     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
412   }
413   case Intrinsic::amdgcn_s_sendmsg:
414   case Intrinsic::amdgcn_s_sendmsghalt: {
415     // FIXME: Should have no register for immediate
416     static const OpRegBankEntry<1> Table[2] = {
417       // Perfectly legal.
418       { { AMDGPU::SGPRRegBankID }, 1 },
419 
420       // Need readlane
421       { { AMDGPU::VGPRRegBankID }, 3 }
422     };
423 
424     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
425     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
426   }
427   default:
428     return RegisterBankInfo::getInstrAlternativeMappings(MI);
429   }
430 }
431 
432 // FIXME: Returns uniform if there's no source value information. This is
433 // probably wrong.
434 static bool isScalarLoadLegal(const MachineInstr &MI) {
435   if (!MI.hasOneMemOperand())
436     return false;
437 
438   const MachineMemOperand *MMO = *MI.memoperands_begin();
439   const unsigned AS = MMO->getAddrSpace();
440   const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
441                        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
442   // Require 4-byte alignment.
443   return MMO->getAlign() >= Align(4) &&
444          // Can't do a scalar atomic load.
445          !MMO->isAtomic() &&
446          // Don't use scalar loads for volatile accesses to non-constant address
447          // spaces.
448          (IsConst || !MMO->isVolatile()) &&
449          // Memory must be known constant, or not written before this load.
450          (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
451          AMDGPUInstrInfo::isUniformMMO(MMO);
452 }
453 
/// Return the alternative register bank mappings for \p MI.
///
/// Each handled opcode builds its own list of candidate mappings (typically an
/// all-SGPR variant and a VGPR/VCC variant); intrinsics are delegated to the
/// dedicated helpers. Opcodes that are not handled, or that leave the switch
/// with `break`, return whatever the base class implementation returns.
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappings(
    const MachineInstr &MI) const {

  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();


  InstructionMappings AltMappings;
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONSTANT: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    if (Size == 1) {
      // A 1-bit constant may additionally live in the VCC bank.
      static const OpRegBankEntry<1> Table[3] = {
        { { AMDGPU::VGPRRegBankID }, 1 },
        { { AMDGPU::SGPRRegBankID }, 1 },
        { { AMDGPU::VCCRegBankID }, 1 }
      };

      return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
    }

    // Wider constants share the VGPR/SGPR table of the cases below.
    LLVM_FALLTHROUGH;
  }
  case TargetOpcode::G_FCONSTANT:
  case TargetOpcode::G_FRAME_INDEX:
  case TargetOpcode::G_GLOBAL_VALUE: {
    static const OpRegBankEntry<1> Table[2] = {
      { { AMDGPU::VGPRRegBankID }, 1 },
      { { AMDGPU::SGPRRegBankID }, 1 }
    };

    return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
  }
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

    if (Size == 1) {
      // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
      const InstructionMapping &SCCMapping = getInstructionMapping(
        1, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
        3); // Num Operands
      AltMappings.push_back(&SCCMapping);

      const InstructionMapping &VCCMapping0 = getInstructionMapping(
        2, 1, getOperandsMapping(
          {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
        3); // Num Operands
      AltMappings.push_back(&VCCMapping0);
      return AltMappings;
    }

    // Only the 64-bit case gets dedicated alternatives below.
    if (Size != 64)
      break;

    const InstructionMapping &SSMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(
      2, 2, getOperandsMapping(
        {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
         AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      3); // Num Operands
    AltMappings.push_back(&VVMapping);
    // NOTE(review): this `break` reaches the final return of the function,
    // which returns the base class result and not AltMappings, so the two
    // 64-bit mappings built above appear to be dropped. Confirm whether
    // `return AltMappings;` was intended here.
    break;
  }
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD: {
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
    unsigned PtrSize = PtrTy.getSizeInBits();
    unsigned AS = PtrTy.getAddressSpace();

    // The all-SGPR mapping is only offered outside the local, region and
    // private address spaces, and only when isScalarLoadLegal() accepts MI.
    if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
         AS != AMDGPUAS::PRIVATE_ADDRESS) &&
        isScalarLoadLegal(MI)) {
      const InstructionMapping &SSMapping = getInstructionMapping(
          1, 1, getOperandsMapping(
                    {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                     AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
          2); // Num Operands
      AltMappings.push_back(&SSMapping);
    }

    const InstructionMapping &VVMapping = getInstructionMapping(
        2, 1,
        getOperandsMapping(
            {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
             AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
        2); // Num Operands
    AltMappings.push_back(&VVMapping);

    // It may be possible to have a vgpr = load sgpr mapping here, because
    // the mubuf instructions support this kind of load, but probably for only
    // gfx7 and older.  However, the addressing mode matching in the instruction
    // selector should be able to do a better job of detecting and selecting
    // these kinds of loads from the vgpr = load vgpr mapping.

    return AltMappings;

  }
  case TargetOpcode::G_SELECT: {
    // Either everything is SGPR, or the condition is VCC and the values are
    // VGPR.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
      4); // Num Operands
    AltMappings.push_back(&VVMapping);

    return AltMappings;
  }
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
  case TargetOpcode::G_SADDE:
  case TargetOpcode::G_SSUBE: {
    // Operands 1 and 4 are the 1-bit ones: SGPR in the scalar mapping, VCC in
    // the vector mapping.
    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
    const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
      getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
         AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&SSMapping);

    const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
      getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
                          AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
      5); // Num Operands
    AltMappings.push_back(&VVMapping);
    return AltMappings;
  }
  case AMDGPU::G_BRCOND: {
    assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);

    // NOTE(review): both mappings below are created with mapping ID 1, while
    // every other case in this function uses 1 and 2 for its two alternatives.
    // Confirm whether the VCC mapping should use ID 2.

    // TODO: Change type to 32 for scalar
    const InstructionMapping &SMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
      2); // Num Operands
    AltMappings.push_back(&SMapping);

    const InstructionMapping &VMapping = getInstructionMapping(
      1, 1, getOperandsMapping(
        {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
      2); // Num Operands
    AltMappings.push_back(&VMapping);
    return AltMappings;
  }
  case AMDGPU::G_INTRINSIC:
    return getInstrAlternativeMappingsIntrinsic(MI, MRI);
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
    return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
  default:
    break;
  }
  return RegisterBankInfo::getInstrAlternativeMappings(MI);
}
640 
641 void AMDGPURegisterBankInfo::split64BitValueForMapping(
642   MachineIRBuilder &B,
643   SmallVector<Register, 2> &Regs,
644   LLT HalfTy,
645   Register Reg) const {
646   assert(HalfTy.getSizeInBits() == 32);
647   MachineRegisterInfo *MRI = B.getMRI();
648   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
649   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
650   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
651   MRI->setRegBank(LoLHS, *Bank);
652   MRI->setRegBank(HiLHS, *Bank);
653 
654   Regs.push_back(LoLHS);
655   Regs.push_back(HiLHS);
656 
657   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
658     .addDef(LoLHS)
659     .addDef(HiLHS)
660     .addUse(Reg);
661 }
662 
663 /// Replace the current type each register in \p Regs has with \p NewTy
664 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
665                           LLT NewTy) {
666   for (Register Reg : Regs) {
667     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
668     MRI.setType(Reg, NewTy);
669   }
670 }
671 
672 static LLT getHalfSizedType(LLT Ty) {
673   if (Ty.isVector()) {
674     assert(Ty.getElementCount().isKnownMultipleOf(2));
675     return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
676                                Ty.getElementType());
677   }
678 
679   assert(Ty.getScalarSizeInBits() % 2 == 0);
680   return LLT::scalar(Ty.getScalarSizeInBits() / 2);
681 }
682 
683 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
684 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
685 /// execute the instruction for each unique combination of values in all lanes
686 /// in the wave. The block will be split such that rest of the instructions are
687 /// moved to a new block.
688 ///
689 /// Essentially performs this loop:
///
691 /// Save Execution Mask
692 /// For (Lane : Wavefront) {
693 ///   Enable Lane, Disable all other lanes
694 ///   SGPR = read SGPR value for current lane from VGPR
695 ///   VGPRResult[Lane] = use_op SGPR
696 /// }
697 /// Restore Execution Mask
698 ///
699 /// There is additional complexity to try for compare values to identify the
700 /// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
  MachineIRBuilder &B,
  iterator_range<MachineBasicBlock::iterator> Range,
  SmallSet<Register, 4> &SGPROperandRegs,
  MachineRegisterInfo &MRI) const {
  // Overview of what is built here:
  //  - The block holding \p Range is split. The instructions in \p Range are
  //    moved into a new self-looping block (LoopBB), everything after the
  //    range is moved into RemainderBB, and RestoreExecBB sits between them.
  //  - Inside LoopBB every use of a register listed in \p SGPROperandRegs is
  //    rewritten to a value produced by V_READFIRSTLANE_B32, and a compare of
  //    that value against the original operand is accumulated into CondReg.
  //  - On return, the insertion point of \p B is the start of RemainderBB.
  // This function always returns true.
  //
  // NOTE: CondReg is only assigned while rewriting operands, so at least one
  // instruction in \p Range is expected to use a register from
  // \p SGPROperandRegs; otherwise the exec update below would be built with an
  // unset register.

  // Track use registers which have already been expanded with a readfirstlane
  // sequence. This may have multiple uses if moving a sequence.
  DenseMap<Register, Register> WaterfalledRegMap;

  MachineBasicBlock &MBB = B.getMBB();
  MachineFunction *MF = &B.getMF();

  // Pick the 32-bit or 64-bit flavour of every exec-mask opcode/register
  // depending on the wavefront size of the subtarget.
  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
  const unsigned WaveAndOpc = Subtarget.isWave32() ?
    AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned MovExecOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const unsigned MovExecTermOpc =
      Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;

  const unsigned XorTermOpc = Subtarget.isWave32() ?
    AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  const unsigned AndSaveExecOpc =  Subtarget.isWave32() ?
    AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  const unsigned ExecReg =  Subtarget.isWave32() ?
    AMDGPU::EXEC_LO : AMDGPU::EXEC;

  // Only needed by the sanity assert after the range has been spliced.
#ifndef NDEBUG
  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif

  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);

  // Don't bother using generic instructions/registers for the exec mask.
  // InitSaveExecReg is the PHI input coming from the original block.
  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
    .addDef(InitSaveExecReg);

  Register PhiExec = MRI.createVirtualRegister(WaveRC);
  Register NewExec = MRI.createVirtualRegister(WaveRC);

  // To insert the loop we need to split the block. Move everything before this
  // point to a new block, and insert a new empty block before this instruction.
  // Resulting layout order: MBB, LoopBB, RestoreExecBB, RemainderBB.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RestoreExecBB);
  MF->insert(MBBI, RemainderBB);

  // The loop either exits to RestoreExecBB or branches back to itself.
  LoopBB->addSuccessor(RestoreExecBB);
  LoopBB->addSuccessor(LoopBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());

  MBB.addSuccessor(LoopBB);
  RestoreExecBB->addSuccessor(RemainderBB);

  B.setInsertPt(*LoopBB, LoopBB->end());

  // PhiExec merges the IMPLICIT_DEF from the entry edge with NewExec from the
  // back edge.
  B.buildInstr(TargetOpcode::PHI)
    .addDef(PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&MBB)
    .addReg(NewExec)
    .addMBB(LoopBB);

  const DebugLoc &DL = B.getDL();

  // Remember the first instruction so the range can be recomputed after the
  // splice below.
  MachineInstr &FirstInst = *Range.begin();

  // Move the instruction into the loop. Note we moved everything after
  // Range.end() already into a new block, so Range.end() is no longer valid.
  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());

  // Figure out the iterator range after splicing the instructions.
  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
  auto NewEnd = LoopBB->end();

  // New loop-body instructions are inserted in front of the first moved
  // instruction.
  MachineBasicBlock::iterator I = Range.begin();
  B.setInsertPt(*LoopBB, I);

  // Accumulated "all operands match" condition; unset until the first operand
  // has been processed.
  Register CondReg;

  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);

  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
    for (MachineOperand &Op : MI.uses()) {
      if (!Op.isReg() || Op.isDef())
        continue;

      // Only operands explicitly requested by the caller are rewritten.
      Register OldReg = Op.getReg();
      if (!SGPROperandRegs.count(OldReg))
        continue;

      // See if we already processed this register in another instruction in the
      // sequence.
      auto OldVal = WaterfalledRegMap.find(OldReg);
      if (OldVal != WaterfalledRegMap.end()) {
        Op.setReg(OldVal->second);
        continue;
      }

      Register OpReg = Op.getReg();
      LLT OpTy = MRI.getType(OpReg);

      const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
      if (OpBank != &AMDGPU::VGPRRegBank) {
        // Insert copy from AGPR to VGPR before the loop.
        // The builder is temporarily pointed at the original block and then
        // moved back in front of the first loop instruction.
        B.setMBB(MBB);
        OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
        MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
        B.setInstr(*I);
      }

      unsigned OpSize = OpTy.getSizeInBits();

      // Can only do a readlane of 32-bit pieces.
      if (OpSize == 32) {
        // Avoid extra copies in the simple case of one 32-bit register.
        Register CurrentLaneOpReg
          = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
        MRI.setType(CurrentLaneOpReg, OpTy);

        constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
        // Read the next variant <- also loop target.
        BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                CurrentLaneOpReg)
          .addReg(OpReg);

        Register NewCondReg = MRI.createVirtualRegister(WaveRC);
        bool First = CondReg == AMDGPU::NoRegister;
        if (First)
          CondReg = NewCondReg;

        // Compare the value just read by the readfirstlane against the
        // original (per-lane) operand value.
        B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
          .addDef(NewCondReg)
          .addReg(CurrentLaneOpReg)
          .addReg(OpReg);
        Op.setReg(CurrentLaneOpReg);

        if (!First) {
          Register AndReg = MRI.createVirtualRegister(WaveRC);

          // If there are multiple operands to consider, AND the conditions
          // together.
          B.buildInstr(WaveAndOpc)
            .addDef(AndReg)
            .addReg(NewCondReg)
            .addReg(CondReg);
          CondReg = AndReg;
        }
      } else {
        LLT S32 = LLT::scalar(32);
        // Scalar pieces that are recombined into the replacement operand once
        // all pieces have been read.
        SmallVector<Register, 8> ReadlanePieces;

        // The compares can be done as 64-bit, but the extract needs to be done
        // in 32-bit pieces.

        bool Is64 = OpSize % 64 == 0;

        unsigned UnmergeTySize = Is64 ? 64 : 32;
        unsigned CmpOp =
            Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;

        // Insert the unmerge before the loop.

        B.setMBB(MBB);
        unsigned NumPieces = OpSize / UnmergeTySize;
        SmallVector<Register, 8> UnmergePieces;
        if (NumPieces == 1) {
          // A single piece needs no unmerge; use the operand directly.
          UnmergePieces.push_back(OpReg);
        } else {
          LLT UnmergeTy = LLT::scalar(UnmergeTySize);
          MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
          for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
            UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
        }
        // Back to inserting in front of the first loop instruction.
        B.setInstr(*I);

        for (Register UnmergePiece : UnmergePieces) {
          Register CurrentLaneOpReg;
          if (Is64) {
            Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
            Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);

            MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
            MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
            MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegLo)
              .addReg(UnmergePiece, 0, AMDGPU::sub0);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpRegHi)
              .addReg(UnmergePiece, 0, AMDGPU::sub1);

            // The two 32-bit halves are merged so the compare below can be
            // done on the full 64-bit piece.
            CurrentLaneOpReg =
              B.buildMerge(LLT::scalar(64),
                           {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
              .getReg(0);

            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);

            if (OpTy.getScalarSizeInBits() == 64) {
              // If we need to produce a 64-bit element vector, so use the
              // merged pieces
              ReadlanePieces.push_back(CurrentLaneOpReg);
            } else {
              // 32-bit element type.
              ReadlanePieces.push_back(CurrentLaneOpRegLo);
              ReadlanePieces.push_back(CurrentLaneOpRegHi);
            }
          } else {
            CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
            MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
            MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);

            // Read the next variant <- also loop target.
            BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
                    CurrentLaneOpReg)
              .addReg(UnmergePiece);
            ReadlanePieces.push_back(CurrentLaneOpReg);
          }

          Register NewCondReg = MRI.createVirtualRegister(WaveRC);
          bool First = CondReg == AMDGPU::NoRegister;
          if (First)
            CondReg = NewCondReg;

          // Compare this piece's scalar value against the original piece.
          B.buildInstr(CmpOp)
            .addDef(NewCondReg)
            .addReg(CurrentLaneOpReg)
            .addReg(UnmergePiece);

          if (!First) {
            Register AndReg = MRI.createVirtualRegister(WaveRC);

            // If there are multiple operands to consider, AND the conditions
            // together.
            B.buildInstr(WaveAndOpc)
              .addDef(AndReg)
              .addReg(NewCondReg)
              .addReg(CondReg);
            CondReg = AndReg;
          }
        }

        // Recombine the scalar pieces into a value of the operand's original
        // type and point the operand at it.
        // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
        // BUILD_VECTOR
        if (OpTy.isVector()) {
          auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
          MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
        } else if (ReadlanePieces.size() > 1) {
          auto Merge = B.buildMerge(OpTy, ReadlanePieces);
          Op.setReg(Merge.getReg(0));
          MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
        } else {
          Op.setReg(ReadlanePieces[0]);
        }
      }

      // Make sure we don't re-process this register again.
      WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
    }
  }

  // Apply the accumulated condition with S_AND_SAVEEXEC. Its result, NewExec,
  // feeds the PHI on the back edge and the XOR at the end of the loop.
  B.buildInstr(AndSaveExecOpc)
    .addDef(NewExec)
    .addReg(CondReg, RegState::Kill);

  MRI.setSimpleHint(NewExec, CondReg);

  // The loop terminators go after the moved instructions.
  B.setInsertPt(*LoopBB, LoopBB->end());

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  B.buildInstr(XorTermOpc)
    .addDef(ExecReg)
    .addReg(ExecReg)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);

  // Save the EXEC mask before the loop.
  // This is appended to the end of the original block.
  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
    .addReg(ExecReg);

  // Restore the EXEC mask after the loop.
  B.setMBB(*RestoreExecBB);
  B.buildInstr(MovExecTermOpc)
    .addDef(ExecReg)
    .addReg(SaveExecReg);

  // Set the insert point after the original instruction, so any new
  // instructions will be in the remainder.
  B.setInsertPt(*RemainderBB, RemainderBB->begin());

  return true;
}
1013 
1014 // Return any unique registers used by \p MI at \p OpIndices that need to be
1015 // handled in a waterfall loop. Returns these registers in \p
1016 // SGPROperandRegs. Returns true if there are any operands to handle and a
1017 // waterfall loop is necessary.
1018 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1019   SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1020   MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1021   for (unsigned Op : OpIndices) {
1022     assert(MI.getOperand(Op).isUse());
1023     Register Reg = MI.getOperand(Op).getReg();
1024     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1025     if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1026       SGPROperandRegs.insert(Reg);
1027   }
1028 
1029   // No operands need to be replaced, so no need to loop.
1030   return !SGPROperandRegs.empty();
1031 }
1032 
1033 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1034   MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1035   ArrayRef<unsigned> OpIndices) const {
1036   // Use a set to avoid extra readfirstlanes in the case where multiple operands
1037   // are the same register.
1038   SmallSet<Register, 4> SGPROperandRegs;
1039 
1040   if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1041     return false;
1042 
1043   MachineBasicBlock::iterator I = MI.getIterator();
1044   return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1045                                 SGPROperandRegs, MRI);
1046 }
1047 
1048 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1049   MachineInstr &MI, MachineRegisterInfo &MRI,
1050   ArrayRef<unsigned> OpIndices) const {
1051   MachineIRBuilder B(MI);
1052   return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1053 }
1054 
1055 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1056 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1057     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1058   Register Reg = MI.getOperand(OpIdx).getReg();
1059   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1060   if (Bank == &AMDGPU::SGPRRegBank)
1061     return;
1062 
1063   LLT Ty = MRI.getType(Reg);
1064   MachineIRBuilder B(MI);
1065 
1066   if (Bank != &AMDGPU::VGPRRegBank) {
1067     // We need to copy from AGPR to VGPR
1068     Reg = B.buildCopy(Ty, Reg).getReg(0);
1069     MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1070   }
1071 
1072   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1073   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1074     .addDef(SGPR)
1075     .addReg(Reg);
1076 
1077   MRI.setType(SGPR, Ty);
1078 
1079   const TargetRegisterClass *Constrained =
1080       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1081   (void)Constrained;
1082   assert(Constrained && "Failed to constrain readfirstlane src reg");
1083 
1084   MI.getOperand(OpIdx).setReg(SGPR);
1085 }
1086 
1087 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1088 /// rest will be in the remainder.
1089 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1090   unsigned TotalSize = Ty.getSizeInBits();
1091   if (!Ty.isVector())
1092     return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1093 
1094   LLT EltTy = Ty.getElementType();
1095   unsigned EltSize = EltTy.getSizeInBits();
1096   assert(FirstSize % EltSize == 0);
1097 
1098   unsigned FirstPartNumElts = FirstSize / EltSize;
1099   unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1100 
1101   return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1102           LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1103 }
1104 
1105 static LLT widen96To128(LLT Ty) {
1106   if (!Ty.isVector())
1107     return LLT::scalar(128);
1108 
1109   LLT EltTy = Ty.getElementType();
1110   assert(128 % EltTy.getSizeInBits() == 0);
1111   return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1112 }
1113 
// Apply the register bank mapping in \p OpdMapper to the load \p MI.
//
// Two situations are rewritten here:
//  - SGPR destination: sub-dword extending loads that produce 32 bits are
//    widened to a 32-bit memory access, and 96-bit loads are either split
//    (alignment below 16) or widened to 128 bits.
//  - Any other destination bank: loads wider than 128 bits are broken into
//    128-bit pieces and the result is assigned to the VGPR bank.
//
// Returns true when the load was rewritten by this function, and false when
// nothing was changed or a legalization helper did not report Legalized.
bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
                        const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
                                              MachineRegisterInfo &MRI) const {
  Register DstReg = MI.getOperand(0).getReg();
  const LLT LoadTy = MRI.getType(DstReg);
  unsigned LoadSize = LoadTy.getSizeInBits();
  // Largest load size (in bits) that is left alone on the non-SGPR path.
  const unsigned MaxNonSmrdLoadSize = 128;

  const RegisterBank *DstBank =
      OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
  if (DstBank == &AMDGPU::SGPRRegBank) {
    // There are some special cases that we need to look at for 32 bit and 96
    // bit SGPR loads otherwise we have nothing to do.
    if (LoadSize != 32 && LoadSize != 96)
      return false;

    // Only the first memory operand is consulted; MMO->getSize() is scaled by
    // 8 to get the memory access size in bits.
    MachineMemOperand *MMO = *MI.memoperands_begin();
    const unsigned MemSize = 8 * MMO->getSize();
    // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
    // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
    // scalar loads should have a load size of 32 but memory access size of less
    // than 32.
    if (LoadSize == 32 &&
        (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
      return false;

    Register PtrReg = MI.getOperand(1).getReg();

    // New instructions built through B are observed by O, which was created
    // for the SGPR bank.
    ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
    MachineIRBuilder B(MI, O);

    if (LoadSize == 32) {
      // This is an extending load from a sub-dword size. Widen the memory
      // access size to 4 bytes and clear the extra high bits appropriately
      const LLT S32 = LLT::scalar(32);
      if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
        // Must extend the sign bit into higher bits for a G_SEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
        // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
        auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
        B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
      } else
        // We do not need to touch the higher bits for regular loads.
        B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
    } else {
      // 96-bit loads are only available for vector loads. We need to split this
      // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
      if (MMO->getAlign() < Align(16)) {
        // Not aligned enough to widen: let the legalizer helper reduce the
        // load to the 64-bit first part. This path uses its own builder and
        // observer (shadowing the outer B) and returns without reaching the
        // eraseFromParent() call below.
        MachineFunction *MF = MI.getParent()->getParent();
        ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
        MachineIRBuilder B(MI, ApplyBank);
        LegalizerHelper Helper(*MF, ApplyBank, B);
        LLT Part64, Part32;
        std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
        if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
            LegalizerHelper::Legalized)
          return false;
        return true;
      } else {
        // Sufficiently aligned: load 128 bits and drop the extra part, either
        // by truncating a scalar or deleting trailing vector elements.
        LLT WiderTy = widen96To128(LoadTy);
        auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
        if (WiderTy.isScalar())
          B.buildTrunc(MI.getOperand(0), WideLoad);
        else {
          B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
                                              WideLoad);
        }
      }
    }

    // The replacement instructions define the original destination, so the
    // original load can go.
    MI.eraseFromParent();
    return true;
  }

  // 128-bit loads are supported for all instruction types.
  if (LoadSize <= MaxNonSmrdLoadSize)
    return false;

  // NOTE(review): DefRegs is not referenced anywhere below.
  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));

  // Fall back to the original pointer operand when the mapper provides no
  // registers for it.
  if (SrcRegs.empty())
    SrcRegs.push_back(MI.getOperand(1).getReg());

  assert(LoadSize % MaxNonSmrdLoadSize == 0);

  // RegBankSelect only emits scalar types, so we need to reset the pointer
  // operand to a pointer type.
  Register BasePtrReg = SrcRegs[0];
  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
  MRI.setType(BasePtrReg, PtrTy);

  // Split into pieces of MaxNonSmrdLoadSize bits each.
  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
  MachineIRBuilder B(MI, Observer);
  LegalizerHelper Helper(B.getMF(), Observer, B);

  if (LoadTy.isVector()) {
    if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  } else {
    if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
      return false;
  }

  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
  return true;
}
1225 
1226 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1227   MachineInstr &MI,
1228   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1229   MachineRegisterInfo &MRI) const {
1230   const MachineFunction &MF = *MI.getMF();
1231   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1232   const auto &TFI = *ST.getFrameLowering();
1233 
1234   // Guard in case the stack growth direction ever changes with scratch
1235   // instructions.
1236   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1237     return false;
1238 
1239   Register Dst = MI.getOperand(0).getReg();
1240   Register AllocSize = MI.getOperand(1).getReg();
1241   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1242 
1243   const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1244 
1245   // TODO: Need to emit a wave reduction to get the maximum size.
1246   if (SizeBank != &AMDGPU::SGPRRegBank)
1247     return false;
1248 
1249   LLT PtrTy = MRI.getType(Dst);
1250   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1251 
1252   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1253   Register SPReg = Info->getStackPtrOffsetReg();
1254   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1255   MachineIRBuilder B(MI, ApplyBank);
1256 
1257   auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1258   auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1259 
1260   auto SPCopy = B.buildCopy(PtrTy, SPReg);
1261   if (Alignment > TFI.getStackAlign()) {
1262     auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1263     B.buildMaskLowPtrBits(Dst, PtrAdd,
1264                           Log2(Alignment) + ST.getWavefrontSizeLog2());
1265   } else {
1266     B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1267   }
1268 
1269   MI.eraseFromParent();
1270   return true;
1271 }
1272 
1273 bool AMDGPURegisterBankInfo::applyMappingImage(
1274     MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1275     MachineRegisterInfo &MRI, int RsrcIdx) const {
1276   const int NumDefs = MI.getNumExplicitDefs();
1277 
1278   // The reported argument index is relative to the IR intrinsic call arguments,
1279   // so we need to shift by the number of defs and the intrinsic ID.
1280   RsrcIdx += NumDefs + 1;
1281 
1282   // Insert copies to VGPR arguments.
1283   applyDefaultMapping(OpdMapper);
1284 
1285   // Fixup any SGPR arguments.
1286   SmallVector<unsigned, 4> SGPRIndexes;
1287   for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1288     if (!MI.getOperand(I).isReg())
1289       continue;
1290 
1291     // If this intrinsic has a sampler, it immediately follows rsrc.
1292     if (I == RsrcIdx || I == RsrcIdx + 1)
1293       SGPRIndexes.push_back(I);
1294   }
1295 
1296   executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1297   return true;
1298 }
1299 
1300 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1301                                         Register Reg) {
1302   MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1303   if (!Def)
1304     return Reg;
1305 
1306   // TODO: Guard against this being an implicit def
1307   return Def->getOperand(0).getReg();
1308 }
1309 
1310 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1311 // the three offsets (voffset, soffset and instoffset)
1312 static unsigned setBufferOffsets(MachineIRBuilder &B,
1313                                  const AMDGPURegisterBankInfo &RBI,
1314                                  Register CombinedOffset, Register &VOffsetReg,
1315                                  Register &SOffsetReg, int64_t &InstOffsetVal,
1316                                  Align Alignment) {
1317   const LLT S32 = LLT::scalar(32);
1318   MachineRegisterInfo *MRI = B.getMRI();
1319 
1320   if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1321     uint32_t SOffset, ImmOffset;
1322     if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1323                                  Alignment)) {
1324       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1325       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1326       InstOffsetVal = ImmOffset;
1327 
1328       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1329       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1330       return SOffset + ImmOffset;
1331     }
1332   }
1333 
1334   Register Base;
1335   unsigned Offset;
1336 
1337   std::tie(Base, Offset) =
1338       AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1339 
1340   uint32_t SOffset, ImmOffset;
1341   if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1342                                                   &RBI.Subtarget, Alignment)) {
1343     if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1344       VOffsetReg = Base;
1345       SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1346       B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1347       InstOffsetVal = ImmOffset;
1348       return 0; // XXX - Why is this 0?
1349     }
1350 
1351     // If we have SGPR base, we can use it for soffset.
1352     if (SOffset == 0) {
1353       VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1354       B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1355       SOffsetReg = Base;
1356       InstOffsetVal = ImmOffset;
1357       return 0; // XXX - Why is this 0?
1358     }
1359   }
1360 
1361   // Handle the variable sgpr + vgpr case.
1362   MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1363   if (Add && (int)Offset >= 0) {
1364     Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1365     Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1366 
1367     const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1368     const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1369 
1370     if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1371       VOffsetReg = Src0;
1372       SOffsetReg = Src1;
1373       return 0;
1374     }
1375 
1376     if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1377       VOffsetReg = Src1;
1378       SOffsetReg = Src0;
1379       return 0;
1380     }
1381   }
1382 
1383   // Ensure we have a VGPR for the combined offset. This could be an issue if we
1384   // have an SGPR offset and a VGPR resource.
1385   if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1386     VOffsetReg = CombinedOffset;
1387   } else {
1388     VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1389     B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1390   }
1391 
1392   SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1393   B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1394   return 0;
1395 }
1396 
1397 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1398   const OperandsMapper &OpdMapper) const {
1399   MachineInstr &MI = OpdMapper.getMI();
1400   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1401 
1402   const LLT S32 = LLT::scalar(32);
1403   Register Dst = MI.getOperand(0).getReg();
1404   LLT Ty = MRI.getType(Dst);
1405 
1406   const RegisterBank *RSrcBank =
1407     OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1408   const RegisterBank *OffsetBank =
1409     OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1410   if (RSrcBank == &AMDGPU::SGPRRegBank &&
1411       OffsetBank == &AMDGPU::SGPRRegBank)
1412     return true; // Legal mapping
1413 
1414   // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1415   // here but don't have an MMO.
1416 
1417   unsigned LoadSize = Ty.getSizeInBits();
1418   int NumLoads = 1;
1419   if (LoadSize == 256 || LoadSize == 512) {
1420     NumLoads = LoadSize / 128;
1421     Ty = Ty.divide(NumLoads);
1422   }
1423 
1424   // Use the alignment to ensure that the required offsets will fit into the
1425   // immediate offsets.
1426   const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1427 
1428   MachineIRBuilder B(MI);
1429   MachineFunction &MF = B.getMF();
1430 
1431   Register SOffset;
1432   Register VOffset;
1433   int64_t ImmOffset = 0;
1434 
1435   unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1436                                         VOffset, SOffset, ImmOffset, Alignment);
1437 
1438   // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1439   // can, but we need to track an MMO for that.
1440   const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1441   const Align MemAlign(4); // FIXME: ABI type alignment?
1442   MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1443     MachinePointerInfo(),
1444     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1445     MachineMemOperand::MOInvariant,
1446     MemSize, MemAlign);
1447   if (MMOOffset != 0)
1448     BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1449 
1450   // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1451   // assume that the buffer is unswizzled.
1452 
1453   Register RSrc = MI.getOperand(1).getReg();
1454   Register VIndex = B.buildConstant(S32, 0).getReg(0);
1455   B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1456 
1457   SmallVector<Register, 4> LoadParts(NumLoads);
1458 
1459   MachineBasicBlock::iterator MII = MI.getIterator();
1460   MachineInstrSpan Span(MII, &B.getMBB());
1461 
1462   for (int i = 0; i < NumLoads; ++i) {
1463     if (NumLoads == 1) {
1464       LoadParts[i] = Dst;
1465     } else {
1466       LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1467       MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1468     }
1469 
1470     MachineMemOperand *MMO = BaseMMO;
1471     if (i != 0)
1472       BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1473 
1474     B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1475       .addDef(LoadParts[i])       // vdata
1476       .addUse(RSrc)               // rsrc
1477       .addUse(VIndex)             // vindex
1478       .addUse(VOffset)            // voffset
1479       .addUse(SOffset)            // soffset
1480       .addImm(ImmOffset + 16 * i) // offset(imm)
1481       .addImm(0)                  // cachepolicy, swizzled buffer(imm)
1482       .addImm(0)                  // idxen(imm)
1483       .addMemOperand(MMO);
1484   }
1485 
1486   // TODO: If only the resource is a VGPR, it may be better to execute the
1487   // scalar load in the waterfall loop if the resource is expected to frequently
1488   // be dynamically uniform.
1489   if (RSrcBank != &AMDGPU::SGPRRegBank) {
1490     // Remove the original instruction to avoid potentially confusing the
1491     // waterfall loop logic.
1492     B.setInstr(*Span.begin());
1493     MI.eraseFromParent();
1494 
1495     SmallSet<Register, 4> OpsToWaterfall;
1496 
1497     OpsToWaterfall.insert(RSrc);
1498     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1499                            OpsToWaterfall, MRI);
1500   }
1501 
1502   if (NumLoads != 1) {
1503     if (Ty.isVector())
1504       B.buildConcatVectors(Dst, LoadParts);
1505     else
1506       B.buildMerge(Dst, LoadParts);
1507   }
1508 
1509   // We removed the instruction earlier with a waterfall loop.
1510   if (RSrcBank == &AMDGPU::SGPRRegBank)
1511     MI.eraseFromParent();
1512 
1513   return true;
1514 }
1515 
1516 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1517                                              bool Signed) const {
1518   MachineInstr &MI = OpdMapper.getMI();
1519   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1520 
1521   // Insert basic copies
1522   applyDefaultMapping(OpdMapper);
1523 
1524   Register DstReg = MI.getOperand(0).getReg();
1525   LLT Ty = MRI.getType(DstReg);
1526 
1527   const LLT S32 = LLT::scalar(32);
1528 
1529   unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1530   Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1531   Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1532   Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1533 
1534   const RegisterBank *DstBank =
1535     OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1536   if (DstBank == &AMDGPU::VGPRRegBank) {
1537     if (Ty == S32)
1538       return true;
1539 
1540     // There is no 64-bit vgpr bitfield extract instructions so the operation
1541     // is expanded to a sequence of instructions that implement the operation.
1542     ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1543     MachineIRBuilder B(MI, ApplyBank);
1544 
1545     const LLT S64 = LLT::scalar(64);
1546     // Shift the source operand so that extracted bits start at bit 0.
1547     auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1548                               : B.buildLShr(S64, SrcReg, OffsetReg);
1549     auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1550 
1551     // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1552     // if the width is a constant.
1553     if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1554       // Use the 32-bit bitfield extract instruction if the width is a constant.
1555       // Depending on the width size, use either the low or high 32-bits.
1556       auto Zero = B.buildConstant(S32, 0);
1557       auto WidthImm = ConstWidth->Value.getZExtValue();
1558       if (WidthImm <= 32) {
1559         // Use bitfield extract on the lower 32-bit source, and then sign-extend
1560         // or clear the upper 32-bits.
1561         auto Extract =
1562             Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1563                    : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1564         auto Extend =
1565             Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1566         B.buildMerge(DstReg, {Extract, Extend});
1567       } else {
1568         // Use bitfield extract on upper 32-bit source, and combine with lower
1569         // 32-bit source.
1570         auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1571         auto Extract =
1572             Signed
1573                 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1574                 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1575         B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1576       }
1577       MI.eraseFromParent();
1578       return true;
1579     }
1580 
1581     // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1582     // operations.
1583     auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1584     auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1585     if (Signed)
1586       B.buildAShr(S64, SignBit, ExtShift);
1587     else
1588       B.buildLShr(S64, SignBit, ExtShift);
1589     MI.eraseFromParent();
1590     return true;
1591   }
1592 
1593   // The scalar form packs the offset and width in a single operand.
1594 
1595   ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1596   MachineIRBuilder B(MI, ApplyBank);
1597 
1598   // Ensure the high bits are clear to insert the offset.
1599   auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1600   auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1601 
1602   // Zeros out the low bits, so don't bother clamping the input value.
1603   auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1604 
1605   // Transformation function, pack the offset and width of a BFE into
1606   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1607   // source, bits [5:0] contain the offset and bits [22:16] the width.
1608   auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1609 
1610   // TODO: It might be worth using a pseudo here to avoid scc clobber and
1611   // register class constraints.
1612   unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1613                              (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1614 
1615   auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1616   if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1617     llvm_unreachable("failed to constrain BFE");
1618 
1619   MI.eraseFromParent();
1620   return true;
1621 }
1622 
1623 // Return a suitable opcode for extending the operands of Opc when widening.
1624 static unsigned getExtendOp(unsigned Opc) {
1625   switch (Opc) {
1626   case TargetOpcode::G_ASHR:
1627   case TargetOpcode::G_SMIN:
1628   case TargetOpcode::G_SMAX:
1629     return TargetOpcode::G_SEXT;
1630   case TargetOpcode::G_LSHR:
1631   case TargetOpcode::G_UMIN:
1632   case TargetOpcode::G_UMAX:
1633     return TargetOpcode::G_ZEXT;
1634   default:
1635     return TargetOpcode::G_ANYEXT;
1636   }
1637 }
1638 
1639 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1640 // any illegal vector extend or unmerge operations.
1641 static std::pair<Register, Register>
1642 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1643   const LLT S32 = LLT::scalar(32);
1644   auto Bitcast = B.buildBitcast(S32, Src);
1645 
1646   if (ExtOpcode == TargetOpcode::G_SEXT) {
1647     auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1648     auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1649     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1650   }
1651 
1652   auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1653   if (ExtOpcode == TargetOpcode::G_ZEXT) {
1654     auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1655     return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1656   }
1657 
1658   assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1659   return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1660 }
1661 
1662 // For cases where only a single copy is inserted for matching register banks.
1663 // Replace the register in the instruction operand
1664 static bool substituteSimpleCopyRegs(
1665   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1666   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1667   if (!SrcReg.empty()) {
1668     assert(SrcReg.size() == 1);
1669     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1670     return true;
1671   }
1672 
1673   return false;
1674 }
1675 
1676 /// Handle register layout difference for f16 images for some subtargets.
1677 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1678                                                 MachineRegisterInfo &MRI,
1679                                                 Register Reg) const {
1680   if (!Subtarget.hasUnpackedD16VMem())
1681     return Reg;
1682 
1683   const LLT S16 = LLT::scalar(16);
1684   LLT StoreVT = MRI.getType(Reg);
1685   if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1686     return Reg;
1687 
1688   auto Unmerge = B.buildUnmerge(S16, Reg);
1689 
1690 
1691   SmallVector<Register, 4> WideRegs;
1692   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1693     WideRegs.push_back(Unmerge.getReg(I));
1694 
1695   const LLT S32 = LLT::scalar(32);
1696   int NumElts = StoreVT.getNumElements();
1697 
1698   return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1699 }
1700 
1701 static std::pair<Register, unsigned>
1702 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1703   int64_t Const;
1704   if (mi_match(Reg, MRI, m_ICst(Const)))
1705     return std::make_pair(Register(), Const);
1706 
1707   Register Base;
1708   if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1709     return std::make_pair(Base, Const);
1710 
1711   // TODO: Handle G_OR used for add case
1712   return std::make_pair(Reg, 0);
1713 }
1714 
1715 std::pair<Register, unsigned>
1716 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1717                                            Register OrigOffset) const {
1718   const unsigned MaxImm = 4095;
1719   Register BaseReg;
1720   unsigned ImmOffset;
1721   const LLT S32 = LLT::scalar(32);
1722 
1723   std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1724                                                            OrigOffset);
1725 
1726   unsigned C1 = 0;
1727   if (ImmOffset != 0) {
1728     // If the immediate value is too big for the immoffset field, put the value
1729     // and -4096 into the immoffset field so that the value that is copied/added
1730     // for the voffset field is a multiple of 4096, and it stands more chance
1731     // of being CSEd with the copy/add for another similar load/store.
1732     // However, do not do that rounding down to a multiple of 4096 if that is a
1733     // negative number, as it appears to be illegal to have a negative offset
1734     // in the vgpr, even if adding the immediate offset makes it positive.
1735     unsigned Overflow = ImmOffset & ~MaxImm;
1736     ImmOffset -= Overflow;
1737     if ((int32_t)Overflow < 0) {
1738       Overflow += ImmOffset;
1739       ImmOffset = 0;
1740     }
1741 
1742     C1 = ImmOffset;
1743     if (Overflow != 0) {
1744       if (!BaseReg)
1745         BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1746       else {
1747         auto OverflowVal = B.buildConstant(S32, Overflow);
1748         BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1749       }
1750     }
1751   }
1752 
1753   if (!BaseReg)
1754     BaseReg = B.buildConstant(S32, 0).getReg(0);
1755 
1756   return {BaseReg, C1};
1757 }
1758 
1759 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1760                                         Register SrcReg) const {
1761   MachineRegisterInfo &MRI = *B.getMRI();
1762   LLT SrcTy = MRI.getType(SrcReg);
1763   if (SrcTy.getSizeInBits() == 32) {
1764     // Use a v_mov_b32 here to make the exec dependency explicit.
1765     B.buildInstr(AMDGPU::V_MOV_B32_e32)
1766       .addDef(DstReg)
1767       .addUse(SrcReg);
1768     return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1769            constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1770   }
1771 
1772   Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1773   Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1774 
1775   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1776     .addDef(TmpReg0)
1777     .addUse(SrcReg, 0, AMDGPU::sub0);
1778   B.buildInstr(AMDGPU::V_MOV_B32_e32)
1779     .addDef(TmpReg1)
1780     .addUse(SrcReg, 0, AMDGPU::sub1);
1781   B.buildInstr(AMDGPU::REG_SEQUENCE)
1782     .addDef(DstReg)
1783     .addUse(TmpReg0)
1784     .addImm(AMDGPU::sub0)
1785     .addUse(TmpReg1)
1786     .addImm(AMDGPU::sub1);
1787 
1788   return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1789          constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1790 }
1791 
1792 /// Utility function for pushing dynamic vector indexes with a constant offset
1793 /// into waterfall loops.
1794 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1795                                    MachineInstr &IdxUseInstr,
1796                                    unsigned OpIdx,
1797                                    unsigned ConstOffset) {
1798   MachineRegisterInfo &MRI = *B.getMRI();
1799   const LLT S32 = LLT::scalar(32);
1800   Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1801   B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1802 
1803   auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1804 
1805   auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1806   MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1807   MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1808   IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1809 }
1810 
1811 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1812 /// original 32-bit source value (to be inserted in the low part of the combined
1813 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1814 /// value.
1815 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1816                                   Register Hi32Reg, Register Lo32Reg,
1817                                   unsigned ExtOpc,
1818                                   const RegisterBank &RegBank,
1819                                   bool IsBooleanSrc = false) {
1820   if (ExtOpc == AMDGPU::G_ZEXT) {
1821     B.buildConstant(Hi32Reg, 0);
1822   } else if (ExtOpc == AMDGPU::G_SEXT) {
1823     if (IsBooleanSrc) {
1824       // If we know the original source was an s1, the high half is the same as
1825       // the low.
1826       B.buildCopy(Hi32Reg, Lo32Reg);
1827     } else {
1828       // Replicate sign bit from 32-bit extended part.
1829       auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1830       B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1831       B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1832     }
1833   } else {
1834     assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1835     B.buildUndef(Hi32Reg);
1836   }
1837 }
1838 
1839 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1840   MachineInstr &MI, MachineRegisterInfo &MRI,
1841   const OperandsMapper &OpdMapper) const {
1842 
1843   Register VecReg = MI.getOperand(1).getReg();
1844   Register Idx = MI.getOperand(2).getReg();
1845 
1846   const RegisterBank &IdxBank =
1847     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1848 
1849   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1850 
1851   LLT VecTy = MRI.getType(VecReg);
1852   unsigned EltSize = VecTy.getScalarSizeInBits();
1853   unsigned NumElem = VecTy.getNumElements();
1854 
1855   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1856                                                   IsDivergentIdx))
1857     return false;
1858 
1859   MachineIRBuilder B(MI);
1860   LLT S32 = LLT::scalar(32);
1861 
1862   const RegisterBank &DstBank =
1863     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1864   const RegisterBank &SrcBank =
1865     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1866 
1867   const RegisterBank &CCBank =
1868     (DstBank == AMDGPU::SGPRRegBank &&
1869      SrcBank == AMDGPU::SGPRRegBank &&
1870      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1871                                      : AMDGPU::VCCRegBank;
1872   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1873 
1874   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1875     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1876     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1877   }
1878 
1879   LLT EltTy = VecTy.getScalarType();
1880   SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1881   unsigned NumLanes = DstRegs.size();
1882   if (!NumLanes)
1883     NumLanes = 1;
1884   else
1885     EltTy = MRI.getType(DstRegs[0]);
1886 
1887   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1888   SmallVector<Register, 2> Res(NumLanes);
1889   for (unsigned L = 0; L < NumLanes; ++L)
1890     Res[L] = UnmergeToEltTy.getReg(L);
1891 
1892   for (unsigned I = 1; I < NumElem; ++I) {
1893     auto IC = B.buildConstant(S32, I);
1894     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1895     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1896     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1897 
1898     for (unsigned L = 0; L < NumLanes; ++L) {
1899       auto S = B.buildSelect(EltTy, Cmp,
1900                              UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1901 
1902       for (unsigned N : { 0, 2, 3 })
1903         MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1904 
1905       Res[L] = S->getOperand(0).getReg();
1906     }
1907   }
1908 
1909   for (unsigned L = 0; L < NumLanes; ++L) {
1910     Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1911     B.buildCopy(DstReg, Res[L]);
1912     MRI.setRegBank(DstReg, DstBank);
1913   }
1914 
1915   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1916   MI.eraseFromParent();
1917 
1918   return true;
1919 }
1920 
1921 // Insert a cross regbank copy for a register if it already has a bank that
1922 // differs from the one we want to set.
1923 static Register constrainRegToBank(MachineRegisterInfo &MRI,
1924                                    MachineIRBuilder &B, Register &Reg,
1925                                    const RegisterBank &Bank) {
1926   const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1927   if (CurrBank && *CurrBank != Bank) {
1928     Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
1929     MRI.setRegBank(Copy, Bank);
1930     return Copy;
1931   }
1932 
1933   MRI.setRegBank(Reg, Bank);
1934   return Reg;
1935 }
1936 
1937 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1938   MachineInstr &MI, MachineRegisterInfo &MRI,
1939   const OperandsMapper &OpdMapper) const {
1940 
1941   Register VecReg = MI.getOperand(1).getReg();
1942   Register Idx = MI.getOperand(3).getReg();
1943 
1944   const RegisterBank &IdxBank =
1945     *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
1946 
1947   bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1948 
1949   LLT VecTy = MRI.getType(VecReg);
1950   unsigned EltSize = VecTy.getScalarSizeInBits();
1951   unsigned NumElem = VecTy.getNumElements();
1952 
1953   if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1954                                                   IsDivergentIdx))
1955     return false;
1956 
1957   MachineIRBuilder B(MI);
1958   LLT S32 = LLT::scalar(32);
1959 
1960   const RegisterBank &DstBank =
1961     *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1962   const RegisterBank &SrcBank =
1963     *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1964   const RegisterBank &InsBank =
1965     *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1966 
1967   const RegisterBank &CCBank =
1968     (DstBank == AMDGPU::SGPRRegBank &&
1969      SrcBank == AMDGPU::SGPRRegBank &&
1970      InsBank == AMDGPU::SGPRRegBank &&
1971      IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1972                                      : AMDGPU::VCCRegBank;
1973   LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1974 
1975   if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1976     Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1977     MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1978   }
1979 
1980   LLT EltTy = VecTy.getScalarType();
1981   SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1982   unsigned NumLanes = InsRegs.size();
1983   if (!NumLanes) {
1984     NumLanes = 1;
1985     InsRegs.push_back(MI.getOperand(2).getReg());
1986   } else {
1987     EltTy = MRI.getType(InsRegs[0]);
1988   }
1989 
1990   auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1991   SmallVector<Register, 16> Ops(NumElem * NumLanes);
1992 
1993   for (unsigned I = 0; I < NumElem; ++I) {
1994     auto IC = B.buildConstant(S32, I);
1995     MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1996     auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1997     MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1998 
1999     for (unsigned L = 0; L < NumLanes; ++L) {
2000       Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2001       Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2002       Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2003 
2004       Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2005       MRI.setRegBank(Select, DstBank);
2006 
2007       Ops[I * NumLanes + L] = Select;
2008     }
2009   }
2010 
2011   LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2012   if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2013     B.buildBuildVector(MI.getOperand(0), Ops);
2014   } else {
2015     auto Vec = B.buildBuildVector(MergeTy, Ops);
2016     MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2017     B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2018   }
2019 
2020   MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2021   MI.eraseFromParent();
2022 
2023   return true;
2024 }
2025 
2026 void AMDGPURegisterBankInfo::applyMappingImpl(
2027     const OperandsMapper &OpdMapper) const {
2028   MachineInstr &MI = OpdMapper.getMI();
2029   unsigned Opc = MI.getOpcode();
2030   MachineRegisterInfo &MRI = OpdMapper.getMRI();
2031   switch (Opc) {
2032   case AMDGPU::G_PHI: {
2033     Register DstReg = MI.getOperand(0).getReg();
2034     LLT DstTy = MRI.getType(DstReg);
2035     if (DstTy != LLT::scalar(1))
2036       break;
2037 
2038     const LLT S32 = LLT::scalar(32);
2039     const RegisterBank *DstBank =
2040       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2041     if (DstBank == &AMDGPU::VCCRegBank) {
2042       applyDefaultMapping(OpdMapper);
2043       // The standard handling only considers the result register bank for
2044       // phis. For VCC, blindly inserting a copy when the phi is lowered will
2045       // produce an invalid copy. We can only copy with some kind of compare to
2046       // get a vector boolean result. Insert a register bank copy that will be
2047       // correctly lowered to a compare.
2048       MachineIRBuilder B(*MI.getParent()->getParent());
2049 
2050       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2051         Register SrcReg = MI.getOperand(I).getReg();
2052         const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2053 
2054         if (SrcBank != &AMDGPU::VCCRegBank) {
2055           MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2056           B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2057 
2058           auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2059           MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2060           MI.getOperand(I).setReg(Copy.getReg(0));
2061         }
2062       }
2063 
2064       return;
2065     }
2066 
2067     // Phi handling is strange and only considers the bank of the destination.
2068     substituteSimpleCopyRegs(OpdMapper, 0);
2069 
2070     // Promote SGPR/VGPR booleans to s32
2071     MachineFunction *MF = MI.getParent()->getParent();
2072     ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2073     MachineIRBuilder B(MI, ApplyBank);
2074     LegalizerHelper Helper(*MF, ApplyBank, B);
2075 
2076     if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2077       llvm_unreachable("widen scalar should have succeeded");
2078 
2079     return;
2080   }
2081   case AMDGPU::G_ICMP:
2082   case AMDGPU::G_UADDO:
2083   case AMDGPU::G_USUBO:
2084   case AMDGPU::G_UADDE:
2085   case AMDGPU::G_SADDE:
2086   case AMDGPU::G_USUBE:
2087   case AMDGPU::G_SSUBE: {
2088     unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2089     Register DstReg = MI.getOperand(BoolDstOp).getReg();
2090 
2091     const RegisterBank *DstBank =
2092       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2093     if (DstBank != &AMDGPU::SGPRRegBank)
2094       break;
2095 
2096     const bool HasCarryIn = MI.getNumOperands() == 5;
2097 
2098     // If this is a scalar compare, promote the result to s32, as the selection
2099     // will end up using a copy to a 32-bit vreg.
2100     const LLT S32 = LLT::scalar(32);
2101     Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2102     MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2103     MI.getOperand(BoolDstOp).setReg(NewDstReg);
2104     MachineIRBuilder B(MI);
2105 
2106     if (HasCarryIn) {
2107       Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2108       MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2109       B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2110       MI.getOperand(4).setReg(NewSrcReg);
2111     }
2112 
2113     MachineBasicBlock *MBB = MI.getParent();
2114     B.setInsertPt(*MBB, std::next(MI.getIterator()));
2115 
2116     // If we had a constrained VCC result register, a copy was inserted to VCC
2117     // from SGPR.
2118     SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2119     if (DefRegs.empty())
2120       DefRegs.push_back(DstReg);
2121     B.buildTrunc(DefRegs[0], NewDstReg);
2122     return;
2123   }
2124   case AMDGPU::G_SELECT: {
2125     Register DstReg = MI.getOperand(0).getReg();
2126     LLT DstTy = MRI.getType(DstReg);
2127 
2128     SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2129     if (CondRegs.empty())
2130       CondRegs.push_back(MI.getOperand(1).getReg());
2131     else {
2132       assert(CondRegs.size() == 1);
2133     }
2134 
2135     const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2136     if (CondBank == &AMDGPU::SGPRRegBank) {
2137       MachineIRBuilder B(MI);
2138       const LLT S32 = LLT::scalar(32);
2139       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2140       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2141 
2142       MI.getOperand(1).setReg(NewCondReg);
2143       B.buildZExt(NewCondReg, CondRegs[0]);
2144     }
2145 
2146     if (DstTy.getSizeInBits() != 64)
2147       break;
2148 
2149     MachineIRBuilder B(MI);
2150     LLT HalfTy = getHalfSizedType(DstTy);
2151 
2152     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2153     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2154     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2155 
2156     // All inputs are SGPRs, nothing special to do.
2157     if (DefRegs.empty()) {
2158       assert(Src1Regs.empty() && Src2Regs.empty());
2159       break;
2160     }
2161 
2162     if (Src1Regs.empty())
2163       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2164     else {
2165       setRegsToType(MRI, Src1Regs, HalfTy);
2166     }
2167 
2168     if (Src2Regs.empty())
2169       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2170     else
2171       setRegsToType(MRI, Src2Regs, HalfTy);
2172 
2173     setRegsToType(MRI, DefRegs, HalfTy);
2174 
2175     B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2176     B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2177 
2178     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2179     MI.eraseFromParent();
2180     return;
2181   }
2182   case AMDGPU::G_BRCOND: {
2183     Register CondReg = MI.getOperand(0).getReg();
2184     // FIXME: Should use legalizer helper, but should change bool ext type.
2185     const RegisterBank *CondBank =
2186       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2187 
2188     if (CondBank == &AMDGPU::SGPRRegBank) {
2189       MachineIRBuilder B(MI);
2190       const LLT S32 = LLT::scalar(32);
2191       Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2192       MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2193 
2194       MI.getOperand(0).setReg(NewCondReg);
2195       B.buildZExt(NewCondReg, CondReg);
2196       return;
2197     }
2198 
2199     break;
2200   }
2201   case AMDGPU::G_AND:
2202   case AMDGPU::G_OR:
2203   case AMDGPU::G_XOR: {
2204     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2205     // there is a VGPR input.
2206     Register DstReg = MI.getOperand(0).getReg();
2207     LLT DstTy = MRI.getType(DstReg);
2208 
2209     if (DstTy.getSizeInBits() == 1) {
2210       const RegisterBank *DstBank =
2211         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2212       if (DstBank == &AMDGPU::VCCRegBank)
2213         break;
2214 
2215       MachineFunction *MF = MI.getParent()->getParent();
2216       ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2217       MachineIRBuilder B(MI, ApplyBank);
2218       LegalizerHelper Helper(*MF, ApplyBank, B);
2219 
2220       if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2221           LegalizerHelper::Legalized)
2222         llvm_unreachable("widen scalar should have succeeded");
2223       return;
2224     }
2225 
2226     if (DstTy.getSizeInBits() != 64)
2227       break;
2228 
2229     LLT HalfTy = getHalfSizedType(DstTy);
2230     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2231     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2232     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2233 
2234     // All inputs are SGPRs, nothing special to do.
2235     if (DefRegs.empty()) {
2236       assert(Src0Regs.empty() && Src1Regs.empty());
2237       break;
2238     }
2239 
2240     assert(DefRegs.size() == 2);
2241     assert(Src0Regs.size() == Src1Regs.size() &&
2242            (Src0Regs.empty() || Src0Regs.size() == 2));
2243 
2244     // Depending on where the source registers came from, the generic code may
2245     // have decided to split the inputs already or not. If not, we still need to
2246     // extract the values.
2247     MachineIRBuilder B(MI);
2248 
2249     if (Src0Regs.empty())
2250       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2251     else
2252       setRegsToType(MRI, Src0Regs, HalfTy);
2253 
2254     if (Src1Regs.empty())
2255       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2256     else
2257       setRegsToType(MRI, Src1Regs, HalfTy);
2258 
2259     setRegsToType(MRI, DefRegs, HalfTy);
2260 
2261     B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2262     B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2263 
2264     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2265     MI.eraseFromParent();
2266     return;
2267   }
2268   case AMDGPU::G_ABS: {
2269     Register SrcReg = MI.getOperand(1).getReg();
2270     const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2271 
2272     // There is no VALU abs instruction so we need to replace it with a sub and
2273     // max combination.
2274     if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2275       MachineFunction *MF = MI.getParent()->getParent();
2276       ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2277       MachineIRBuilder B(MI, Apply);
2278       LegalizerHelper Helper(*MF, Apply, B);
2279 
2280       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2281         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2282       return;
2283     }
2284     LLVM_FALLTHROUGH;
2285   }
2286   case AMDGPU::G_ADD:
2287   case AMDGPU::G_SUB:
2288   case AMDGPU::G_MUL:
2289   case AMDGPU::G_SHL:
2290   case AMDGPU::G_LSHR:
2291   case AMDGPU::G_ASHR:
2292   case AMDGPU::G_SMIN:
2293   case AMDGPU::G_SMAX:
2294   case AMDGPU::G_UMIN:
2295   case AMDGPU::G_UMAX: {
2296     Register DstReg = MI.getOperand(0).getReg();
2297     LLT DstTy = MRI.getType(DstReg);
2298 
2299     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2300     // Packed 16-bit operations need to be scalarized and promoted.
2301     if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2302       break;
2303 
2304     const RegisterBank *DstBank =
2305       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2306     if (DstBank == &AMDGPU::VGPRRegBank)
2307       break;
2308 
2309     const LLT S32 = LLT::scalar(32);
2310     MachineBasicBlock *MBB = MI.getParent();
2311     MachineFunction *MF = MBB->getParent();
2312     ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2313     MachineIRBuilder B(MI, ApplySALU);
2314 
2315     if (DstTy.isVector()) {
2316       Register WideSrc0Lo, WideSrc0Hi;
2317       Register WideSrc1Lo, WideSrc1Hi;
2318 
2319       unsigned ExtendOp = getExtendOp(MI.getOpcode());
2320       std::tie(WideSrc0Lo, WideSrc0Hi)
2321         = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2322       std::tie(WideSrc1Lo, WideSrc1Hi)
2323         = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2324       auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2325       auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2326       B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2327       MI.eraseFromParent();
2328     } else {
2329       LegalizerHelper Helper(*MF, ApplySALU, B);
2330 
2331       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2332         llvm_unreachable("widen scalar should have succeeded");
2333 
2334       // FIXME: s16 shift amounts should be legal.
2335       if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2336           Opc == AMDGPU::G_ASHR) {
2337         B.setInsertPt(*MBB, MI.getIterator());
2338         if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2339           llvm_unreachable("widen scalar should have succeeded");
2340       }
2341     }
2342 
2343     return;
2344   }
2345   case AMDGPU::G_SEXT_INREG: {
2346     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2347     if (SrcRegs.empty())
2348       break; // Nothing to repair
2349 
2350     const LLT S32 = LLT::scalar(32);
2351     MachineIRBuilder B(MI);
2352     ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2353     GISelObserverWrapper Observer(&O);
2354     B.setChangeObserver(Observer);
2355 
2356     // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2357     // we would need to further expand, and doesn't let us directly set the
2358     // result registers.
2359     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2360 
2361     int Amt = MI.getOperand(2).getImm();
2362     if (Amt <= 32) {
2363       if (Amt == 32) {
2364         // The low bits are unchanged.
2365         B.buildCopy(DstRegs[0], SrcRegs[0]);
2366       } else {
2367         // Extend in the low bits and propagate the sign bit to the high half.
2368         B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2369       }
2370 
2371       B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2372     } else {
2373       // The low bits are unchanged, and extend in the high bits.
2374       B.buildCopy(DstRegs[0], SrcRegs[0]);
2375       B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2376     }
2377 
2378     Register DstReg = MI.getOperand(0).getReg();
2379     MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2380     MI.eraseFromParent();
2381     return;
2382   }
2383   case AMDGPU::G_CTPOP:
2384   case AMDGPU::G_BITREVERSE: {
2385     const RegisterBank *DstBank =
2386       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2387     if (DstBank == &AMDGPU::SGPRRegBank)
2388       break;
2389 
2390     Register SrcReg = MI.getOperand(1).getReg();
2391     const LLT S32 = LLT::scalar(32);
2392     LLT Ty = MRI.getType(SrcReg);
2393     if (Ty == S32)
2394       break;
2395 
2396     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2397     MachineIRBuilder B(MI, ApplyVALU);
2398 
2399     MachineFunction &MF = B.getMF();
2400     LegalizerHelper Helper(MF, ApplyVALU, B);
2401 
2402     if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2403       llvm_unreachable("narrowScalar should have succeeded");
2404     return;
2405   }
2406   case AMDGPU::G_AMDGPU_FFBH_U32:
2407   case AMDGPU::G_AMDGPU_FFBL_B32:
2408   case AMDGPU::G_CTLZ_ZERO_UNDEF:
2409   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2410     const RegisterBank *DstBank =
2411         OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2412     if (DstBank == &AMDGPU::SGPRRegBank)
2413       break;
2414 
2415     Register SrcReg = MI.getOperand(1).getReg();
2416     const LLT S32 = LLT::scalar(32);
2417     LLT Ty = MRI.getType(SrcReg);
2418     if (Ty == S32)
2419       break;
2420 
2421     // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2422     // which return -1 when the input is zero:
2423     // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2424     // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2425     // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2426     // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
2427     ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2428     MachineIRBuilder B(MI, ApplyVALU);
2429     SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2430     unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2431                           ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2432                           : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2433                                 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2434                                 : Opc;
2435     unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2436     auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2437     auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2438     unsigned AddOpc =
2439         Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2440             ? AMDGPU::G_ADD
2441             : AMDGPU::G_UADDSAT;
2442     Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2443     Register DstReg = MI.getOperand(0).getReg();
2444     B.buildUMin(DstReg, X, Y);
2445     MI.eraseFromParent();
2446     return;
2447   }
2448   case AMDGPU::G_SEXT:
2449   case AMDGPU::G_ZEXT:
2450   case AMDGPU::G_ANYEXT: {
2451     Register SrcReg = MI.getOperand(1).getReg();
2452     LLT SrcTy = MRI.getType(SrcReg);
2453     const bool Signed = Opc == AMDGPU::G_SEXT;
2454 
2455     assert(empty(OpdMapper.getVRegs(1)));
2456 
2457     MachineIRBuilder B(MI);
2458     const RegisterBank *SrcBank =
2459       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2460 
2461     Register DstReg = MI.getOperand(0).getReg();
2462     LLT DstTy = MRI.getType(DstReg);
2463     if (DstTy.isScalar() &&
2464         SrcBank != &AMDGPU::SGPRRegBank &&
2465         SrcBank != &AMDGPU::VCCRegBank &&
2466         // FIXME: Should handle any type that rounds to s64 when irregular
2467         // breakdowns are supported.
2468         DstTy.getSizeInBits() == 64 &&
2469         SrcTy.getSizeInBits() <= 32) {
2470       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2471 
2472       // Extend to 32-bit, and then extend the low half.
2473       if (Signed) {
2474         // TODO: Should really be buildSExtOrCopy
2475         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2476       } else if (Opc == AMDGPU::G_ZEXT) {
2477         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2478       } else {
2479         B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2480       }
2481 
2482       extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2483       MRI.setRegBank(DstReg, *SrcBank);
2484       MI.eraseFromParent();
2485       return;
2486     }
2487 
2488     if (SrcTy != LLT::scalar(1))
2489       return;
2490 
2491     // It is not legal to have a legalization artifact with a VCC source. Rather
2492     // than introducing a copy, insert the select we would have to select the
2493     // copy to.
2494     if (SrcBank == &AMDGPU::VCCRegBank) {
2495       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2496 
2497       const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2498 
2499       unsigned DstSize = DstTy.getSizeInBits();
2500       // 64-bit select is SGPR only
2501       const bool UseSel64 = DstSize > 32 &&
2502         SrcBank->getID() == AMDGPU::SGPRRegBankID;
2503 
2504       // TODO: Should s16 select be legal?
2505       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2506       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2507       auto False = B.buildConstant(SelType, 0);
2508 
2509       MRI.setRegBank(True.getReg(0), *DstBank);
2510       MRI.setRegBank(False.getReg(0), *DstBank);
2511       MRI.setRegBank(DstReg, *DstBank);
2512 
2513       if (DstSize > 32) {
2514         B.buildSelect(DefRegs[0], SrcReg, True, False);
2515         extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2516       } else if (DstSize < 32) {
2517         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2518         MRI.setRegBank(Sel.getReg(0), *DstBank);
2519         B.buildTrunc(DstReg, Sel);
2520       } else {
2521         B.buildSelect(DstReg, SrcReg, True, False);
2522       }
2523 
2524       MI.eraseFromParent();
2525       return;
2526     }
2527 
2528     break;
2529   }
2530   case AMDGPU::G_BUILD_VECTOR:
2531   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2532     Register DstReg = MI.getOperand(0).getReg();
2533     LLT DstTy = MRI.getType(DstReg);
2534     if (DstTy != LLT::fixed_vector(2, 16))
2535       break;
2536 
2537     assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2538     substituteSimpleCopyRegs(OpdMapper, 1);
2539     substituteSimpleCopyRegs(OpdMapper, 2);
2540 
2541     const RegisterBank *DstBank =
2542       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2543     if (DstBank == &AMDGPU::SGPRRegBank)
2544       break; // Can use S_PACK_* instructions.
2545 
2546     MachineIRBuilder B(MI);
2547 
2548     Register Lo = MI.getOperand(1).getReg();
2549     Register Hi = MI.getOperand(2).getReg();
2550     const LLT S32 = LLT::scalar(32);
2551 
2552     const RegisterBank *BankLo =
2553       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2554     const RegisterBank *BankHi =
2555       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2556 
2557     Register ZextLo;
2558     Register ShiftHi;
2559 
2560     if (Opc == AMDGPU::G_BUILD_VECTOR) {
2561       ZextLo = B.buildZExt(S32, Lo).getReg(0);
2562       MRI.setRegBank(ZextLo, *BankLo);
2563 
2564       Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2565       MRI.setRegBank(ZextHi, *BankHi);
2566 
2567       auto ShiftAmt = B.buildConstant(S32, 16);
2568       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2569 
2570       ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2571       MRI.setRegBank(ShiftHi, *BankHi);
2572     } else {
2573       Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2574       MRI.setRegBank(MaskLo, *BankLo);
2575 
2576       auto ShiftAmt = B.buildConstant(S32, 16);
2577       MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2578 
2579       ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2580       MRI.setRegBank(ShiftHi, *BankHi);
2581 
2582       ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2583       MRI.setRegBank(ZextLo, *BankLo);
2584     }
2585 
2586     auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2587     MRI.setRegBank(Or.getReg(0), *DstBank);
2588 
2589     B.buildBitcast(DstReg, Or);
2590     MI.eraseFromParent();
2591     return;
2592   }
2593   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2594     SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2595 
2596     assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2597 
2598     Register DstReg = MI.getOperand(0).getReg();
2599     Register SrcReg = MI.getOperand(1).getReg();
2600 
2601     const LLT S32 = LLT::scalar(32);
2602     LLT DstTy = MRI.getType(DstReg);
2603     LLT SrcTy = MRI.getType(SrcReg);
2604 
2605     if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2606       return;
2607 
2608     MachineIRBuilder B(MI);
2609 
2610     const ValueMapping &DstMapping
2611       = OpdMapper.getInstrMapping().getOperandMapping(0);
2612     const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2613     const RegisterBank *SrcBank =
2614       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2615     const RegisterBank *IdxBank =
2616         OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2617 
2618     Register BaseIdxReg;
2619     unsigned ConstOffset;
2620     std::tie(BaseIdxReg, ConstOffset) =
2621         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2622 
2623     // See if the index is an add of a constant which will be foldable by moving
2624     // the base register of the index later if this is going to be executed in a
2625     // waterfall loop. This is essentially to reassociate the add of a constant
2626     // with the readfirstlane.
2627     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2628                                    ConstOffset > 0 &&
2629                                    ConstOffset < SrcTy.getNumElements();
2630 
2631     // Move the base register. We'll re-insert the add later.
2632     if (ShouldMoveIndexIntoLoop)
2633       MI.getOperand(2).setReg(BaseIdxReg);
2634 
2635     // If this is a VGPR result only because the index was a VGPR result, the
2636     // actual indexing will be done on the SGPR source vector, which will
2637     // produce a scalar result. We need to copy to the VGPR result inside the
2638     // waterfall loop.
2639     const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2640                                 SrcBank == &AMDGPU::SGPRRegBank;
2641     if (DstRegs.empty()) {
2642       applyDefaultMapping(OpdMapper);
2643 
2644       executeInWaterfallLoop(MI, MRI, { 2 });
2645 
2646       if (NeedCopyToVGPR) {
2647         // We don't want a phi for this temporary reg.
2648         Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2649         MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2650         MI.getOperand(0).setReg(TmpReg);
2651         B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2652 
2653         // Use a v_mov_b32 here to make the exec dependency explicit.
2654         buildVCopy(B, DstReg, TmpReg);
2655       }
2656 
2657       // Re-insert the constant offset add inside the waterfall loop.
2658       if (ShouldMoveIndexIntoLoop)
2659         reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2660 
2661       return;
2662     }
2663 
2664     assert(DstTy.getSizeInBits() == 64);
2665 
2666     LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2667 
2668     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2669     auto One = B.buildConstant(S32, 1);
2670 
2671     MachineBasicBlock::iterator MII = MI.getIterator();
2672 
2673     // Split the vector index into 32-bit pieces. Prepare to move all of the
2674     // new instructions into a waterfall loop if necessary.
2675     //
2676     // Don't put the bitcast or constant in the loop.
2677     MachineInstrSpan Span(MII, &B.getMBB());
2678 
2679     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2680     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2681     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2682 
2683     auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2684     auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2685 
2686     MRI.setRegBank(DstReg, *DstBank);
2687     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2688     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2689     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2690     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2691 
2692     SmallSet<Register, 4> OpsToWaterfall;
2693     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2694       MI.eraseFromParent();
2695       return;
2696     }
2697 
2698     // Remove the original instruction to avoid potentially confusing the
2699     // waterfall loop logic.
2700     B.setInstr(*Span.begin());
2701     MI.eraseFromParent();
2702     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2703                            OpsToWaterfall, MRI);
2704 
2705     if (NeedCopyToVGPR) {
2706       MachineBasicBlock *LoopBB = Extract1->getParent();
2707       Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2708       Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2709       MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2710       MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2711 
2712       Extract0->getOperand(0).setReg(TmpReg0);
2713       Extract1->getOperand(0).setReg(TmpReg1);
2714 
2715       B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2716 
2717       buildVCopy(B, DstRegs[0], TmpReg0);
2718       buildVCopy(B, DstRegs[1], TmpReg1);
2719     }
2720 
2721     if (ShouldMoveIndexIntoLoop)
2722       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2723 
2724     return;
2725   }
2726   case AMDGPU::G_INSERT_VECTOR_ELT: {
2727     SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2728 
2729     Register DstReg = MI.getOperand(0).getReg();
2730     LLT VecTy = MRI.getType(DstReg);
2731 
2732     assert(OpdMapper.getVRegs(0).empty());
2733     assert(OpdMapper.getVRegs(3).empty());
2734 
2735     if (substituteSimpleCopyRegs(OpdMapper, 1))
2736       MRI.setType(MI.getOperand(1).getReg(), VecTy);
2737 
2738     if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2739       return;
2740 
2741     const RegisterBank *IdxBank =
2742       OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2743 
2744     Register SrcReg = MI.getOperand(1).getReg();
2745     Register InsReg = MI.getOperand(2).getReg();
2746     LLT InsTy = MRI.getType(InsReg);
2747     (void)InsTy;
2748 
2749     Register BaseIdxReg;
2750     unsigned ConstOffset;
2751     std::tie(BaseIdxReg, ConstOffset) =
2752         AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2753 
2754     // See if the index is an add of a constant which will be foldable by moving
2755     // the base register of the index later if this is going to be executed in a
2756     // waterfall loop. This is essentially to reassociate the add of a constant
2757     // with the readfirstlane.
2758     bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2759       ConstOffset > 0 &&
2760       ConstOffset < VecTy.getNumElements();
2761 
2762     // Move the base register. We'll re-insert the add later.
2763     if (ShouldMoveIndexIntoLoop)
2764       MI.getOperand(3).setReg(BaseIdxReg);
2765 
2766 
2767     if (InsRegs.empty()) {
2768       executeInWaterfallLoop(MI, MRI, { 3 });
2769 
2770       // Re-insert the constant offset add inside the waterfall loop.
2771       if (ShouldMoveIndexIntoLoop) {
2772         MachineIRBuilder B(MI);
2773         reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2774       }
2775 
2776       return;
2777     }
2778 
2779 
2780     assert(InsTy.getSizeInBits() == 64);
2781 
2782     const LLT S32 = LLT::scalar(32);
2783     LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2784 
2785     MachineIRBuilder B(MI);
2786     auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2787     auto One = B.buildConstant(S32, 1);
2788 
2789     // Split the vector index into 32-bit pieces. Prepare to move all of the
2790     // new instructions into a waterfall loop if necessary.
2791     //
2792     // Don't put the bitcast or constant in the loop.
2793     MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2794 
2795     // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2796     auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2797     auto IdxHi = B.buildAdd(S32, IdxLo, One);
2798 
2799     auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2800     auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2801 
2802     const RegisterBank *DstBank =
2803       OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2804     const RegisterBank *SrcBank =
2805       OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2806     const RegisterBank *InsSrcBank =
2807       OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2808 
2809     MRI.setRegBank(InsReg, *InsSrcBank);
2810     MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2811     MRI.setRegBank(InsLo.getReg(0), *DstBank);
2812     MRI.setRegBank(InsHi.getReg(0), *DstBank);
2813     MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2814     MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2815     MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2816 
2817 
2818     SmallSet<Register, 4> OpsToWaterfall;
2819     if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2820       B.setInsertPt(B.getMBB(), MI);
2821       B.buildBitcast(DstReg, InsHi);
2822       MI.eraseFromParent();
2823       return;
2824     }
2825 
2826     B.setInstr(*Span.begin());
2827     MI.eraseFromParent();
2828 
2829     // Figure out the point after the waterfall loop before mangling the control
2830     // flow.
2831     executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2832                            OpsToWaterfall, MRI);
2833 
2834     // The insertion point is now right after the original instruction.
2835     //
2836     // Keep the bitcast to the original vector type out of the loop. Doing this
2837     // saved an extra phi we don't need inside the loop.
2838     B.buildBitcast(DstReg, InsHi);
2839 
2840     // Re-insert the constant offset add inside the waterfall loop.
2841     if (ShouldMoveIndexIntoLoop)
2842       reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2843 
2844     return;
2845   }
2846   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2847   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2848   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2849   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2850   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2851   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2852   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2853   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2854   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2855   case AMDGPU::G_AMDGPU_BUFFER_STORE:
2856   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2857   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2858   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2859   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2860   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2861   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2862     applyDefaultMapping(OpdMapper);
2863     executeInWaterfallLoop(MI, MRI, {1, 4});
2864     return;
2865   }
2866   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2867   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2868   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2869   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2870   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2871   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2872   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2873   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2874   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2875   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2876   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2877   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2878     applyDefaultMapping(OpdMapper);
2879     executeInWaterfallLoop(MI, MRI, {2, 5});
2880     return;
2881   }
2882   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2883   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2884   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2885     applyDefaultMapping(OpdMapper);
2886     executeInWaterfallLoop(MI, MRI, {2, 5});
2887     return;
2888   }
2889   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2890     applyDefaultMapping(OpdMapper);
2891     executeInWaterfallLoop(MI, MRI, {3, 6});
2892     return;
2893   }
2894   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2895     applyMappingSBufferLoad(OpdMapper);
2896     return;
2897   }
2898   case AMDGPU::G_INTRINSIC: {
2899     switch (MI.getIntrinsicID()) {
2900     case Intrinsic::amdgcn_readlane: {
2901       substituteSimpleCopyRegs(OpdMapper, 2);
2902 
2903       assert(OpdMapper.getVRegs(0).empty());
2904       assert(OpdMapper.getVRegs(3).empty());
2905 
2906       // Make sure the index is an SGPR. It doesn't make sense to run this in a
2907       // waterfall loop, so assume it's a uniform value.
2908       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2909       return;
2910     }
2911     case Intrinsic::amdgcn_writelane: {
2912       assert(OpdMapper.getVRegs(0).empty());
2913       assert(OpdMapper.getVRegs(2).empty());
2914       assert(OpdMapper.getVRegs(3).empty());
2915 
2916       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2917       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2918       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2919       return;
2920     }
2921     case Intrinsic::amdgcn_interp_p1:
2922     case Intrinsic::amdgcn_interp_p2:
2923     case Intrinsic::amdgcn_interp_mov:
2924     case Intrinsic::amdgcn_interp_p1_f16:
2925     case Intrinsic::amdgcn_interp_p2_f16: {
2926       applyDefaultMapping(OpdMapper);
2927 
2928       // Readlane for m0 value, which is always the last operand.
2929       // FIXME: Should this be a waterfall loop instead?
2930       constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2931       return;
2932     }
2933     case Intrinsic::amdgcn_permlane16:
2934     case Intrinsic::amdgcn_permlanex16: {
2935       // Doing a waterfall loop over these wouldn't make any sense.
2936       substituteSimpleCopyRegs(OpdMapper, 2);
2937       substituteSimpleCopyRegs(OpdMapper, 3);
2938       constrainOpWithReadfirstlane(MI, MRI, 4);
2939       constrainOpWithReadfirstlane(MI, MRI, 5);
2940       return;
2941     }
2942     case Intrinsic::amdgcn_sbfe:
2943       applyMappingBFE(OpdMapper, true);
2944       return;
2945     case Intrinsic::amdgcn_ubfe:
2946       applyMappingBFE(OpdMapper, false);
2947       return;
2948     case Intrinsic::amdgcn_ballot:
2949       // Use default handling and insert copy to vcc source.
2950       break;
2951     }
2952     break;
2953   }
2954   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2955   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
2956   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
2957   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
2958     const AMDGPU::RsrcIntrinsic *RSrcIntrin
2959       = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2960     assert(RSrcIntrin && RSrcIntrin->IsImage);
2961     // Non-images can have complications from operands that allow both SGPR
2962     // and VGPR. For now it's too complicated to figure out the final opcode
2963     // to derive the register bank from the MCInstrDesc.
2964     applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2965     return;
2966   }
2967   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2968     unsigned N = MI.getNumExplicitOperands() - 2;
2969     applyDefaultMapping(OpdMapper);
2970     executeInWaterfallLoop(MI, MRI, { N });
2971     return;
2972   }
2973   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2974     auto IntrID = MI.getIntrinsicID();
2975     switch (IntrID) {
2976     case Intrinsic::amdgcn_ds_ordered_add:
2977     case Intrinsic::amdgcn_ds_ordered_swap: {
2978       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2979       assert(OpdMapper.getVRegs(0).empty());
2980       substituteSimpleCopyRegs(OpdMapper, 3);
2981       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2982       return;
2983     }
2984     case Intrinsic::amdgcn_ds_gws_init:
2985     case Intrinsic::amdgcn_ds_gws_barrier:
2986     case Intrinsic::amdgcn_ds_gws_sema_br: {
2987       // Only the first lane executes, so readfirstlane is safe.
2988       substituteSimpleCopyRegs(OpdMapper, 1);
2989       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2990       return;
2991     }
2992     case Intrinsic::amdgcn_ds_gws_sema_v:
2993     case Intrinsic::amdgcn_ds_gws_sema_p:
2994     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2995       // Only the first lane executes, so readfirstlane is safe.
2996       constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2997       return;
2998     }
2999     case Intrinsic::amdgcn_ds_append:
3000     case Intrinsic::amdgcn_ds_consume: {
3001       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3002       return;
3003     }
3004     case Intrinsic::amdgcn_s_sendmsg:
3005     case Intrinsic::amdgcn_s_sendmsghalt: {
3006       // FIXME: Should this use a waterfall loop?
3007       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3008       return;
3009     }
3010     case Intrinsic::amdgcn_s_setreg: {
3011       constrainOpWithReadfirstlane(MI, MRI, 2);
3012       return;
3013     }
3014     default: {
3015       if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3016               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3017         // Non-images can have complications from operands that allow both SGPR
3018         // and VGPR. For now it's too complicated to figure out the final opcode
3019         // to derive the register bank from the MCInstrDesc.
3020         if (RSrcIntrin->IsImage) {
3021           applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3022           return;
3023         }
3024       }
3025 
3026       break;
3027     }
3028     }
3029     break;
3030   }
3031   case AMDGPU::G_SI_CALL: {
3032     // Use a set to avoid extra readfirstlanes in the case where multiple
3033     // operands are the same register.
3034     SmallSet<Register, 4> SGPROperandRegs;
3035 
3036     if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3037       break;
3038 
3039     // Move all copies to physical SGPRs that are used by the call instruction
3040     // into the loop block. Start searching for these copies until the
3041     // ADJCALLSTACKUP.
3042     unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3043     unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3044 
3045     // Move all non-copies before the copies, so that a complete range can be
3046     // moved into the waterfall loop.
3047     SmallVector<MachineInstr *, 4> NonCopyInstrs;
3048     // Count of NonCopyInstrs found until the current LastCopy.
3049     unsigned NonCopyInstrsLen = 0;
3050     MachineBasicBlock::iterator Start(&MI);
3051     MachineBasicBlock::iterator LastCopy = Start;
3052     MachineBasicBlock *MBB = MI.getParent();
3053     const SIMachineFunctionInfo *Info =
3054         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3055     while (Start->getOpcode() != FrameSetupOpcode) {
3056       --Start;
3057       bool IsCopy = false;
3058       if (Start->getOpcode() == AMDGPU::COPY) {
3059         auto &Dst = Start->getOperand(0);
3060         if (Dst.isReg()) {
3061           Register Reg = Dst.getReg();
3062           if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3063             IsCopy = true;
3064           } else {
3065             // Also move the copy from the scratch rsrc descriptor into the loop
3066             // to allow it to be optimized away.
3067             auto &Src = Start->getOperand(1);
3068             if (Src.isReg()) {
3069               Reg = Src.getReg();
3070               IsCopy = Info->getScratchRSrcReg() == Reg;
3071             }
3072           }
3073         }
3074       }
3075 
3076       if (IsCopy) {
3077         LastCopy = Start;
3078         NonCopyInstrsLen = NonCopyInstrs.size();
3079       } else {
3080         NonCopyInstrs.push_back(&*Start);
3081       }
3082     }
3083     NonCopyInstrs.resize(NonCopyInstrsLen);
3084 
3085     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3086       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3087     }
3088     Start = LastCopy;
3089 
3090     // Do the same for copies after the loop
3091     NonCopyInstrs.clear();
3092     NonCopyInstrsLen = 0;
3093     MachineBasicBlock::iterator End(&MI);
3094     LastCopy = End;
3095     while (End->getOpcode() != FrameDestroyOpcode) {
3096       ++End;
3097       bool IsCopy = false;
3098       if (End->getOpcode() == AMDGPU::COPY) {
3099         auto &Src = End->getOperand(1);
3100         if (Src.isReg()) {
3101           Register Reg = Src.getReg();
3102           IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3103         }
3104       }
3105 
3106       if (IsCopy) {
3107         LastCopy = End;
3108         NonCopyInstrsLen = NonCopyInstrs.size();
3109       } else {
3110         NonCopyInstrs.push_back(&*End);
3111       }
3112     }
3113     NonCopyInstrs.resize(NonCopyInstrsLen);
3114 
3115     End = LastCopy;
3116     ++LastCopy;
3117     for (auto *NonCopy : reverse(NonCopyInstrs)) {
3118       MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3119     }
3120 
3121     ++End;
3122     MachineIRBuilder B(*Start);
3123     executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3124     break;
3125   }
3126   case AMDGPU::G_LOAD:
3127   case AMDGPU::G_ZEXTLOAD:
3128   case AMDGPU::G_SEXTLOAD: {
3129     if (applyMappingLoad(MI, OpdMapper, MRI))
3130       return;
3131     break;
3132   }
3133   case AMDGPU::G_DYN_STACKALLOC:
3134     applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3135     return;
3136   case AMDGPU::G_SBFX:
3137     applyMappingBFE(OpdMapper, /*Signed*/ true);
3138     return;
3139   case AMDGPU::G_UBFX:
3140     applyMappingBFE(OpdMapper, /*Signed*/ false);
3141     return;
3142   default:
3143     break;
3144   }
3145 
3146   return applyDefaultMapping(OpdMapper);
3147 }
3148 
3149 // vgpr, sgpr -> vgpr
3150 // vgpr, agpr -> vgpr
3151 // agpr, agpr -> agpr
3152 // agpr, sgpr -> vgpr
3153 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3154   if (RB0 == AMDGPU::InvalidRegBankID)
3155     return RB1;
3156   if (RB1 == AMDGPU::InvalidRegBankID)
3157     return RB0;
3158 
3159   if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3160     return AMDGPU::SGPRRegBankID;
3161 
3162   if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3163     return AMDGPU::AGPRRegBankID;
3164 
3165   return AMDGPU::VGPRRegBankID;
3166 }
3167 
3168 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3169   if (RB0 == AMDGPU::InvalidRegBankID)
3170     return RB1;
3171   if (RB1 == AMDGPU::InvalidRegBankID)
3172     return RB0;
3173 
3174   // vcc, vcc -> vcc
3175   // vcc, sgpr -> vcc
3176   // vcc, vgpr -> vcc
3177   if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3178     return AMDGPU::VCCRegBankID;
3179 
3180   // vcc, vgpr -> vgpr
3181   return regBankUnion(RB0, RB1);
3182 }
3183 
3184 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3185                                                 const MachineInstr &MI) const {
3186   unsigned RegBank = AMDGPU::InvalidRegBankID;
3187 
3188   for (const MachineOperand &MO : MI.operands()) {
3189     if (!MO.isReg())
3190       continue;
3191     Register Reg = MO.getReg();
3192     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3193       RegBank = regBankUnion(RegBank, Bank->getID());
3194       if (RegBank == AMDGPU::VGPRRegBankID)
3195         break;
3196     }
3197   }
3198 
3199   return RegBank;
3200 }
3201 
3202 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3203   const MachineFunction &MF = *MI.getParent()->getParent();
3204   const MachineRegisterInfo &MRI = MF.getRegInfo();
3205   for (const MachineOperand &MO : MI.operands()) {
3206     if (!MO.isReg())
3207       continue;
3208     Register Reg = MO.getReg();
3209     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3210       if (Bank->getID() != AMDGPU::SGPRRegBankID)
3211         return false;
3212     }
3213   }
3214   return true;
3215 }
3216 
3217 const RegisterBankInfo::InstructionMapping &
3218 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3219   const MachineFunction &MF = *MI.getParent()->getParent();
3220   const MachineRegisterInfo &MRI = MF.getRegInfo();
3221   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3222 
3223   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3224     const MachineOperand &SrcOp = MI.getOperand(i);
3225     if (!SrcOp.isReg())
3226       continue;
3227 
3228     unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3229     OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3230   }
3231   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3232                                MI.getNumOperands());
3233 }
3234 
3235 const RegisterBankInfo::InstructionMapping &
3236 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3237   const MachineFunction &MF = *MI.getParent()->getParent();
3238   const MachineRegisterInfo &MRI = MF.getRegInfo();
3239   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3240 
3241   // Even though we technically could use SGPRs, this would require knowledge of
3242   // the constant bus restriction. Force all sources to VGPR (except for VCC).
3243   //
3244   // TODO: Unary ops are trivially OK, so accept SGPRs?
3245   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3246     const MachineOperand &Src = MI.getOperand(i);
3247     if (!Src.isReg())
3248       continue;
3249 
3250     unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3251     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3252     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3253   }
3254 
3255   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3256                                MI.getNumOperands());
3257 }
3258 
3259 const RegisterBankInfo::InstructionMapping &
3260 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3261   const MachineFunction &MF = *MI.getParent()->getParent();
3262   const MachineRegisterInfo &MRI = MF.getRegInfo();
3263   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3264 
3265   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3266     const MachineOperand &Op = MI.getOperand(I);
3267     if (!Op.isReg())
3268       continue;
3269 
3270     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3271     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3272   }
3273 
3274   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3275                                MI.getNumOperands());
3276 }
3277 
3278 const RegisterBankInfo::InstructionMapping &
3279 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3280                                         const MachineInstr &MI,
3281                                         int RsrcIdx) const {
3282   // The reported argument index is relative to the IR intrinsic call arguments,
3283   // so we need to shift by the number of defs and the intrinsic ID.
3284   RsrcIdx += MI.getNumExplicitDefs() + 1;
3285 
3286   const int NumOps = MI.getNumOperands();
3287   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3288 
3289   // TODO: Should packed/unpacked D16 difference be reported here as part of
3290   // the value mapping?
3291   for (int I = 0; I != NumOps; ++I) {
3292     if (!MI.getOperand(I).isReg())
3293       continue;
3294 
3295     Register OpReg = MI.getOperand(I).getReg();
3296     // We replace some dead address operands with $noreg
3297     if (!OpReg)
3298       continue;
3299 
3300     unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3301 
3302     // FIXME: Probably need a new intrinsic register bank searchable table to
3303     // handle arbitrary intrinsics easily.
3304     //
3305     // If this has a sampler, it immediately follows rsrc.
3306     const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3307 
3308     if (MustBeSGPR) {
3309       // If this must be an SGPR, so we must report whatever it is as legal.
3310       unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3311       OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3312     } else {
3313       // Some operands must be VGPR, and these are easy to copy to.
3314       OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3315     }
3316   }
3317 
3318   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3319 }
3320 
3321 /// Return the mapping for a pointer argument.
3322 const RegisterBankInfo::ValueMapping *
3323 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3324                                               Register PtrReg) const {
3325   LLT PtrTy = MRI.getType(PtrReg);
3326   unsigned Size = PtrTy.getSizeInBits();
3327   if (Subtarget.useFlatForGlobal() ||
3328       !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3329     return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3330 
3331   // If we're using MUBUF instructions for global memory, an SGPR base register
3332   // is possible. Otherwise this needs to be a VGPR.
3333   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3334   return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3335 }
3336 
3337 const RegisterBankInfo::InstructionMapping &
3338 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3339 
3340   const MachineFunction &MF = *MI.getParent()->getParent();
3341   const MachineRegisterInfo &MRI = MF.getRegInfo();
3342   SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3343   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3344   Register PtrReg = MI.getOperand(1).getReg();
3345   LLT PtrTy = MRI.getType(PtrReg);
3346   unsigned AS = PtrTy.getAddressSpace();
3347   unsigned PtrSize = PtrTy.getSizeInBits();
3348 
3349   const ValueMapping *ValMapping;
3350   const ValueMapping *PtrMapping;
3351 
3352   const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3353 
3354   if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3355     if (isScalarLoadLegal(MI)) {
3356       // We have a uniform instruction so we want to use an SMRD load
3357       ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3358       PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3359     } else {
3360       ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3361 
3362       // If we're using MUBUF instructions for global memory, an SGPR base
3363       // register is possible. Otherwise this needs to be a VGPR.
3364       unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3365         AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3366 
3367       PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3368     }
3369   } else {
3370     ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3371     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3372   }
3373 
3374   OpdsMapping[0] = ValMapping;
3375   OpdsMapping[1] = PtrMapping;
3376   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3377       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3378   return Mapping;
3379 
3380   // FIXME: Do we want to add a mapping for FLAT load, or should we just
3381   // handle that during instruction selection?
3382 }
3383 
3384 unsigned
3385 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3386                                      const MachineRegisterInfo &MRI,
3387                                      unsigned Default) const {
3388   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3389   return Bank ? Bank->getID() : Default;
3390 }
3391 
3392 const RegisterBankInfo::ValueMapping *
3393 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3394                                          const MachineRegisterInfo &MRI,
3395                                          const TargetRegisterInfo &TRI) const {
3396   // Lie and claim anything is legal, even though this needs to be an SGPR
3397   // applyMapping will have to deal with it as a waterfall loop.
3398   unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3399   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3400   return AMDGPU::getValueMapping(Bank, Size);
3401 }
3402 
3403 const RegisterBankInfo::ValueMapping *
3404 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3405                                          const MachineRegisterInfo &MRI,
3406                                          const TargetRegisterInfo &TRI) const {
3407   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3408   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3409 }
3410 
3411 const RegisterBankInfo::ValueMapping *
3412 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3413                                          const MachineRegisterInfo &MRI,
3414                                          const TargetRegisterInfo &TRI) const {
3415   unsigned Size = getSizeInBits(Reg, MRI, TRI);
3416   return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3417 }
3418 
3419 ///
3420 /// This function must return a legal mapping, because
3421 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
3424 ///
3425 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3426 // legal. These will be dealt with in applyMappingImpl.
3427 //
3428 const RegisterBankInfo::InstructionMapping &
3429 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3430   const MachineFunction &MF = *MI.getParent()->getParent();
3431   const MachineRegisterInfo &MRI = MF.getRegInfo();
3432 
3433   if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3434     // The default logic bothers to analyze impossible alternative mappings. We
3435     // want the most straightforward mapping, so just directly handle this.
3436     const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3437                                              *TRI);
3438     const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3439                                              *TRI);
3440     assert(SrcBank && "src bank should have been assigned already");
3441     if (!DstBank)
3442       DstBank = SrcBank;
3443 
3444     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3445     if (cannotCopy(*DstBank, *SrcBank, Size))
3446       return getInvalidInstructionMapping();
3447 
3448     const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3449     unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3450     SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3451     OpdsMapping[0] = &ValMap;
3452     if (MI.getOpcode() == AMDGPU::G_FREEZE)
3453       OpdsMapping[1] = &ValMap;
3454 
3455     return getInstructionMapping(
3456         1, /*Cost*/ 1,
3457         /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3458   }
3459 
3460   if (MI.isRegSequence()) {
3461     // If any input is a VGPR, the result must be a VGPR. The default handling
3462     // assumes any copy between banks is legal.
3463     unsigned BankID = AMDGPU::SGPRRegBankID;
3464 
3465     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3466       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3467       // It doesn't make sense to use vcc or scc banks here, so just ignore
3468       // them.
3469       if (OpBank != AMDGPU::SGPRRegBankID) {
3470         BankID = AMDGPU::VGPRRegBankID;
3471         break;
3472       }
3473     }
3474     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3475 
3476     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3477     return getInstructionMapping(
3478         1, /*Cost*/ 1,
3479         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3480   }
3481 
3482   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3483   // properly.
3484   //
3485   // TODO: There are additional exec masking dependencies to analyze.
3486   if (MI.getOpcode() == TargetOpcode::G_PHI) {
3487     unsigned ResultBank = AMDGPU::InvalidRegBankID;
3488     Register DstReg = MI.getOperand(0).getReg();
3489 
3490     // Sometimes the result may have already been assigned a bank.
3491     if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3492       ResultBank = DstBank->getID();
3493 
3494     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3495       Register Reg = MI.getOperand(I).getReg();
3496       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3497 
3498       // FIXME: Assuming VGPR for any undetermined inputs.
3499       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3500         ResultBank = AMDGPU::VGPRRegBankID;
3501         break;
3502       }
3503 
3504       // FIXME: Need to promote SGPR case to s32
3505       unsigned OpBank = Bank->getID();
3506       ResultBank = regBankBoolUnion(ResultBank, OpBank);
3507     }
3508 
3509     assert(ResultBank != AMDGPU::InvalidRegBankID);
3510 
3511     unsigned Size = MRI.getType(DstReg).getSizeInBits();
3512 
3513     const ValueMapping &ValMap =
3514         getValueMapping(0, Size, getRegBank(ResultBank));
3515     return getInstructionMapping(
3516         1, /*Cost*/ 1,
3517         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3518   }
3519 
3520   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3521   if (Mapping.isValid())
3522     return Mapping;
3523 
3524   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3525 
3526   switch (MI.getOpcode()) {
3527   default:
3528     return getInvalidInstructionMapping();
3529 
3530   case AMDGPU::G_AND:
3531   case AMDGPU::G_OR:
3532   case AMDGPU::G_XOR: {
3533     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3534     if (Size == 1) {
3535       const RegisterBank *DstBank
3536         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3537 
3538       unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3539       unsigned BankLHS = AMDGPU::InvalidRegBankID;
3540       unsigned BankRHS = AMDGPU::InvalidRegBankID;
3541       if (DstBank) {
3542         TargetBankID = DstBank->getID();
3543         if (DstBank == &AMDGPU::VCCRegBank) {
3544           TargetBankID = AMDGPU::VCCRegBankID;
3545           BankLHS = AMDGPU::VCCRegBankID;
3546           BankRHS = AMDGPU::VCCRegBankID;
3547         } else {
3548           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3549                                  AMDGPU::SGPRRegBankID);
3550           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3551                                  AMDGPU::SGPRRegBankID);
3552         }
3553       } else {
3554         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3555                                AMDGPU::VCCRegBankID);
3556         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3557                                AMDGPU::VCCRegBankID);
3558 
3559         // Both inputs should be true booleans to produce a boolean result.
3560         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3561           TargetBankID = AMDGPU::VGPRRegBankID;
3562         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3563           TargetBankID = AMDGPU::VCCRegBankID;
3564           BankLHS = AMDGPU::VCCRegBankID;
3565           BankRHS = AMDGPU::VCCRegBankID;
3566         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3567           TargetBankID = AMDGPU::SGPRRegBankID;
3568         }
3569       }
3570 
3571       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3572       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3573       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3574       break;
3575     }
3576 
3577     if (Size == 64) {
3578 
3579       if (isSALUMapping(MI)) {
3580         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3581         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3582       } else {
3583         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3584         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3585         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3586 
3587         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3588         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3589       }
3590 
3591       break;
3592     }
3593 
3594     LLVM_FALLTHROUGH;
3595   }
3596   case AMDGPU::G_PTR_ADD:
3597   case AMDGPU::G_PTRMASK:
3598   case AMDGPU::G_ADD:
3599   case AMDGPU::G_SUB:
3600   case AMDGPU::G_MUL:
3601   case AMDGPU::G_SHL:
3602   case AMDGPU::G_LSHR:
3603   case AMDGPU::G_ASHR:
3604   case AMDGPU::G_UADDO:
3605   case AMDGPU::G_USUBO:
3606   case AMDGPU::G_UADDE:
3607   case AMDGPU::G_SADDE:
3608   case AMDGPU::G_USUBE:
3609   case AMDGPU::G_SSUBE:
3610   case AMDGPU::G_SMIN:
3611   case AMDGPU::G_SMAX:
3612   case AMDGPU::G_UMIN:
3613   case AMDGPU::G_UMAX:
3614   case AMDGPU::G_ABS:
3615   case AMDGPU::G_SHUFFLE_VECTOR:
3616   case AMDGPU::G_SBFX:
3617   case AMDGPU::G_UBFX:
3618     if (isSALUMapping(MI))
3619       return getDefaultMappingSOP(MI);
3620     LLVM_FALLTHROUGH;
3621 
3622   case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3623   case AMDGPU::G_SSUBSAT:
3624   case AMDGPU::G_UADDSAT:
3625   case AMDGPU::G_USUBSAT:
3626   case AMDGPU::G_FADD:
3627   case AMDGPU::G_FSUB:
3628   case AMDGPU::G_FPTOSI:
3629   case AMDGPU::G_FPTOUI:
3630   case AMDGPU::G_FMUL:
3631   case AMDGPU::G_FMA:
3632   case AMDGPU::G_FMAD:
3633   case AMDGPU::G_FSQRT:
3634   case AMDGPU::G_FFLOOR:
3635   case AMDGPU::G_FCEIL:
3636   case AMDGPU::G_FRINT:
3637   case AMDGPU::G_SITOFP:
3638   case AMDGPU::G_UITOFP:
3639   case AMDGPU::G_FPTRUNC:
3640   case AMDGPU::G_FPEXT:
3641   case AMDGPU::G_FEXP2:
3642   case AMDGPU::G_FLOG2:
3643   case AMDGPU::G_FMINNUM:
3644   case AMDGPU::G_FMAXNUM:
3645   case AMDGPU::G_FMINNUM_IEEE:
3646   case AMDGPU::G_FMAXNUM_IEEE:
3647   case AMDGPU::G_FCANONICALIZE:
3648   case AMDGPU::G_INTRINSIC_TRUNC:
3649   case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3650   case AMDGPU::G_FSHR: // TODO: Expand for scalar
3651   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3652   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3653   case AMDGPU::G_AMDGPU_RCP_IFLAG:
3654   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3655   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3656   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3657   case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3658   case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3659   case AMDGPU::G_AMDGPU_SMED3:
3660     return getDefaultMappingVOP(MI);
3661   case AMDGPU::G_UMULH:
3662   case AMDGPU::G_SMULH: {
3663     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3664       return getDefaultMappingSOP(MI);
3665     return getDefaultMappingVOP(MI);
3666   }
3667   case AMDGPU::G_IMPLICIT_DEF: {
3668     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3669     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3670     break;
3671   }
3672   case AMDGPU::G_FCONSTANT:
3673   case AMDGPU::G_CONSTANT:
3674   case AMDGPU::G_GLOBAL_VALUE:
3675   case AMDGPU::G_BLOCK_ADDR:
3676   case AMDGPU::G_READCYCLECOUNTER: {
3677     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3678     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3679     break;
3680   }
3681   case AMDGPU::G_FRAME_INDEX: {
3682     // TODO: This should be the same as other constants, but eliminateFrameIndex
3683     // currently assumes VALU uses.
3684     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3685     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3686     break;
3687   }
3688   case AMDGPU::G_DYN_STACKALLOC: {
3689     // Result is always uniform, and a wave reduction is needed for the source.
3690     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3691     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3692     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3693     break;
3694   }
3695   case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3696     // This case is weird because we expect a physical register in the source,
3697     // but need to set a bank anyway.
3698     //
3699     // We could select the result to SGPR or VGPR, but for the one current use
3700     // it's more practical to always use VGPR.
3701     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3702     OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3703     break;
3704   }
3705   case AMDGPU::G_INSERT: {
3706     unsigned BankID = getMappingType(MRI, MI);
3707     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3708     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3709     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3710     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3711     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3712     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3713     OpdsMapping[3] = nullptr;
3714     break;
3715   }
3716   case AMDGPU::G_EXTRACT: {
3717     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3718     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3719     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3720     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3721     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3722     OpdsMapping[2] = nullptr;
3723     break;
3724   }
3725   case AMDGPU::G_BUILD_VECTOR:
3726   case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3727     LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3728     if (DstTy == LLT::fixed_vector(2, 16)) {
3729       unsigned DstSize = DstTy.getSizeInBits();
3730       unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3731       unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3732       unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3733       unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3734 
3735       OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3736       OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3737       OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3738       break;
3739     }
3740 
3741     LLVM_FALLTHROUGH;
3742   }
3743   case AMDGPU::G_MERGE_VALUES:
3744   case AMDGPU::G_CONCAT_VECTORS: {
3745     unsigned Bank = getMappingType(MRI, MI);
3746     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3747     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3748 
3749     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3750     // Op1 and Dst should use the same register bank.
3751     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3752       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3753     break;
3754   }
3755   case AMDGPU::G_BITREVERSE:
3756   case AMDGPU::G_BITCAST:
3757   case AMDGPU::G_INTTOPTR:
3758   case AMDGPU::G_PTRTOINT:
3759   case AMDGPU::G_FABS:
3760   case AMDGPU::G_FNEG: {
3761     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3762     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3763     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3764     break;
3765   }
3766   case AMDGPU::G_AMDGPU_FFBH_U32:
3767   case AMDGPU::G_AMDGPU_FFBL_B32:
3768   case AMDGPU::G_CTLZ_ZERO_UNDEF:
3769   case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3770     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3771     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3772     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3773     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3774     break;
3775   }
3776   case AMDGPU::G_CTPOP: {
3777     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3778     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3779     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3780 
3781     // This should really be getValueMappingSGPR64Only, but allowing the generic
3782     // code to handle the register split just makes using LegalizerHelper more
3783     // difficult.
3784     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3785     break;
3786   }
3787   case AMDGPU::G_TRUNC: {
3788     Register Dst = MI.getOperand(0).getReg();
3789     Register Src = MI.getOperand(1).getReg();
3790     unsigned Bank = getRegBankID(Src, MRI);
3791     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3792     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3793     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3794     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3795     break;
3796   }
3797   case AMDGPU::G_ZEXT:
3798   case AMDGPU::G_SEXT:
3799   case AMDGPU::G_ANYEXT:
3800   case AMDGPU::G_SEXT_INREG: {
3801     Register Dst = MI.getOperand(0).getReg();
3802     Register Src = MI.getOperand(1).getReg();
3803     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3804     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3805 
3806     unsigned DstBank;
3807     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3808     assert(SrcBank);
3809     switch (SrcBank->getID()) {
3810     case AMDGPU::SGPRRegBankID:
3811       DstBank = AMDGPU::SGPRRegBankID;
3812       break;
3813     default:
3814       DstBank = AMDGPU::VGPRRegBankID;
3815       break;
3816     }
3817 
3818     // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3819     // 32-bits, and then to 64.
3820     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3821     OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3822                                                        SrcSize);
3823     break;
3824   }
3825   case AMDGPU::G_FCMP: {
3826     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3827     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3828     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3829     OpdsMapping[1] = nullptr; // Predicate Operand.
3830     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3831     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3832     break;
3833   }
3834   case AMDGPU::G_STORE: {
3835     assert(MI.getOperand(0).isReg());
3836     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3837 
3838     // FIXME: We need to specify a different reg bank once scalar stores are
3839     // supported.
3840     const ValueMapping *ValMapping =
3841         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3842     OpdsMapping[0] = ValMapping;
3843     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3844     break;
3845   }
3846   case AMDGPU::G_ICMP: {
3847     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3848     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3849 
3850     // See if the result register has already been constrained to vcc, which may
3851     // happen due to control flow intrinsic lowering.
3852     unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3853                                     AMDGPU::SGPRRegBankID);
3854     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3855     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3856 
3857     bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3858                      Op2Bank == AMDGPU::SGPRRegBankID &&
3859                      Op3Bank == AMDGPU::SGPRRegBankID &&
3860       (Size == 32 || (Size == 64 &&
3861                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3862                       Subtarget.hasScalarCompareEq64()));
3863 
3864     DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3865     unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3866 
3867     // TODO: Use 32-bit for scalar output size.
3868     // SCC results will need to be copied to a 32-bit SGPR virtual register.
3869     const unsigned ResultSize = 1;
3870 
3871     OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3872     OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3873     OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3874     break;
3875   }
3876   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3877     // VGPR index can be used for waterfall when indexing a SGPR vector.
3878     unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3879     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3880     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3881     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3882     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3883     unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3884 
3885     OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3886     OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3887 
3888     // The index can be either if the source vector is VGPR.
3889     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3890     break;
3891   }
3892   case AMDGPU::G_INSERT_VECTOR_ELT: {
3893     unsigned OutputBankID = isSALUMapping(MI) ?
3894       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3895 
3896     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3897     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3898     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3899     unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3900     unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3901 
3902     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3903     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3904 
3905     // This is a weird case, because we need to break down the mapping based on
3906     // the register bank of a different operand.
3907     if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3908       OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3909                                                       InsertSize);
3910     } else {
3911       assert(InsertSize == 32 || InsertSize == 64);
3912       OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3913     }
3914 
3915     // The index can be either if the source vector is VGPR.
3916     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3917     break;
3918   }
3919   case AMDGPU::G_UNMERGE_VALUES: {
3920     unsigned Bank = getMappingType(MRI, MI);
3921 
3922     // Op1 and Dst should use the same register bank.
3923     // FIXME: Shouldn't this be the default? Why do we need to handle this?
3924     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3925       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3926       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3927     }
3928     break;
3929   }
3930   case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3931   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3932   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3933   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3934   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3935   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3936   case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3937   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3938   case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3939   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3940   case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3941   case AMDGPU::G_AMDGPU_BUFFER_STORE:
3942   case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3943   case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3944   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3945   case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3946     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3947 
3948     // rsrc
3949     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3950 
3951     // vindex
3952     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3953 
3954     // voffset
3955     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3956 
3957     // soffset
3958     OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3959 
3960     // Any remaining operands are immediates and were correctly null
3961     // initialized.
3962     break;
3963   }
3964   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3965   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3966   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3967   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3968   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3969   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3970   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3971   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3972   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3973   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3974   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3975   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3976   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3977   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3978   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3979     // vdata_out
3980     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3981 
3982     // vdata_in
3983     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3984 
3985     // rsrc
3986     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3987 
3988     // vindex
3989     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3990 
3991     // voffset
3992     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3993 
3994     // soffset
3995     OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3996 
3997     // Any remaining operands are immediates and were correctly null
3998     // initialized.
3999     break;
4000   }
4001   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4002     // vdata_out
4003     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4004 
4005     // vdata_in
4006     OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4007 
4008     // cmp
4009     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4010 
4011     // rsrc
4012     OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4013 
4014     // vindex
4015     OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4016 
4017     // voffset
4018     OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4019 
4020     // soffset
4021     OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4022 
4023     // Any remaining operands are immediates and were correctly null
4024     // initialized.
4025     break;
4026   }
4027   case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4028     // Lie and claim everything is legal, even though some need to be
4029     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4030     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4031     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4032 
4033     // We need to convert this to a MUBUF if either the resource of offset is
4034     // VGPR.
4035     unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4036     unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4037     unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4038 
4039     unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4040     OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4041     break;
4042   }
4043   case AMDGPU::G_INTRINSIC: {
4044     switch (MI.getIntrinsicID()) {
4045     default:
4046       return getInvalidInstructionMapping();
4047     case Intrinsic::amdgcn_div_fmas:
4048     case Intrinsic::amdgcn_div_fixup:
4049     case Intrinsic::amdgcn_trig_preop:
4050     case Intrinsic::amdgcn_sin:
4051     case Intrinsic::amdgcn_cos:
4052     case Intrinsic::amdgcn_log_clamp:
4053     case Intrinsic::amdgcn_rcp:
4054     case Intrinsic::amdgcn_rcp_legacy:
4055     case Intrinsic::amdgcn_sqrt:
4056     case Intrinsic::amdgcn_rsq:
4057     case Intrinsic::amdgcn_rsq_legacy:
4058     case Intrinsic::amdgcn_rsq_clamp:
4059     case Intrinsic::amdgcn_fmul_legacy:
4060     case Intrinsic::amdgcn_fma_legacy:
4061     case Intrinsic::amdgcn_ldexp:
4062     case Intrinsic::amdgcn_frexp_mant:
4063     case Intrinsic::amdgcn_frexp_exp:
4064     case Intrinsic::amdgcn_fract:
4065     case Intrinsic::amdgcn_cvt_pkrtz:
4066     case Intrinsic::amdgcn_cvt_pknorm_i16:
4067     case Intrinsic::amdgcn_cvt_pknorm_u16:
4068     case Intrinsic::amdgcn_cvt_pk_i16:
4069     case Intrinsic::amdgcn_cvt_pk_u16:
4070     case Intrinsic::amdgcn_fmed3:
4071     case Intrinsic::amdgcn_cubeid:
4072     case Intrinsic::amdgcn_cubema:
4073     case Intrinsic::amdgcn_cubesc:
4074     case Intrinsic::amdgcn_cubetc:
4075     case Intrinsic::amdgcn_sffbh:
4076     case Intrinsic::amdgcn_fmad_ftz:
4077     case Intrinsic::amdgcn_mbcnt_lo:
4078     case Intrinsic::amdgcn_mbcnt_hi:
4079     case Intrinsic::amdgcn_mul_u24:
4080     case Intrinsic::amdgcn_mul_i24:
4081     case Intrinsic::amdgcn_mulhi_u24:
4082     case Intrinsic::amdgcn_mulhi_i24:
4083     case Intrinsic::amdgcn_lerp:
4084     case Intrinsic::amdgcn_sad_u8:
4085     case Intrinsic::amdgcn_msad_u8:
4086     case Intrinsic::amdgcn_sad_hi_u8:
4087     case Intrinsic::amdgcn_sad_u16:
4088     case Intrinsic::amdgcn_qsad_pk_u16_u8:
4089     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4090     case Intrinsic::amdgcn_mqsad_u32_u8:
4091     case Intrinsic::amdgcn_cvt_pk_u8_f32:
4092     case Intrinsic::amdgcn_alignbyte:
4093     case Intrinsic::amdgcn_perm:
4094     case Intrinsic::amdgcn_fdot2:
4095     case Intrinsic::amdgcn_sdot2:
4096     case Intrinsic::amdgcn_udot2:
4097     case Intrinsic::amdgcn_sdot4:
4098     case Intrinsic::amdgcn_udot4:
4099     case Intrinsic::amdgcn_sdot8:
4100     case Intrinsic::amdgcn_udot8:
4101       return getDefaultMappingVOP(MI);
4102     case Intrinsic::amdgcn_sbfe:
4103     case Intrinsic::amdgcn_ubfe:
4104       if (isSALUMapping(MI))
4105         return getDefaultMappingSOP(MI);
4106       return getDefaultMappingVOP(MI);
4107     case Intrinsic::amdgcn_ds_swizzle:
4108     case Intrinsic::amdgcn_ds_permute:
4109     case Intrinsic::amdgcn_ds_bpermute:
4110     case Intrinsic::amdgcn_update_dpp:
4111     case Intrinsic::amdgcn_mov_dpp8:
4112     case Intrinsic::amdgcn_mov_dpp:
4113     case Intrinsic::amdgcn_strict_wwm:
4114     case Intrinsic::amdgcn_wwm:
4115     case Intrinsic::amdgcn_strict_wqm:
4116     case Intrinsic::amdgcn_wqm:
4117     case Intrinsic::amdgcn_softwqm:
4118     case Intrinsic::amdgcn_set_inactive:
4119       return getDefaultMappingAllVGPR(MI);
4120     case Intrinsic::amdgcn_kernarg_segment_ptr:
4121     case Intrinsic::amdgcn_s_getpc:
4122     case Intrinsic::amdgcn_groupstaticsize:
4123     case Intrinsic::amdgcn_reloc_constant:
4124     case Intrinsic::returnaddress: {
4125       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4126       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4127       break;
4128     }
4129     case Intrinsic::amdgcn_wqm_vote: {
4130       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4131       OpdsMapping[0] = OpdsMapping[2]
4132         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4133       break;
4134     }
4135     case Intrinsic::amdgcn_ps_live: {
4136       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4137       break;
4138     }
4139     case Intrinsic::amdgcn_div_scale: {
4140       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4141       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4142       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4143       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4144 
4145       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4146       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4147       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4148       break;
4149     }
4150     case Intrinsic::amdgcn_class: {
4151       Register Src0Reg = MI.getOperand(2).getReg();
4152       Register Src1Reg = MI.getOperand(3).getReg();
4153       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4154       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4155       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4156       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4157       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4158       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4159       break;
4160     }
4161     case Intrinsic::amdgcn_icmp:
4162     case Intrinsic::amdgcn_fcmp: {
4163       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4164       // This is not VCCRegBank because this is not used in boolean contexts.
4165       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4166       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4167       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4168       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4169       break;
4170     }
4171     case Intrinsic::amdgcn_readlane: {
4172       // This must be an SGPR, but accept a VGPR.
4173       Register IdxReg = MI.getOperand(3).getReg();
4174       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4175       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4176       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4177       LLVM_FALLTHROUGH;
4178     }
4179     case Intrinsic::amdgcn_readfirstlane: {
4180       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4181       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4182       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4183       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4184       break;
4185     }
4186     case Intrinsic::amdgcn_writelane: {
4187       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4188       Register SrcReg = MI.getOperand(2).getReg();
4189       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4190       unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4191       Register IdxReg = MI.getOperand(3).getReg();
4192       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4193       unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4194       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4195 
4196       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4197       // to legalize.
4198       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4199       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4200       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4201       break;
4202     }
4203     case Intrinsic::amdgcn_if_break: {
4204       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4205       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4206       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4207       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4208       break;
4209     }
4210     case Intrinsic::amdgcn_permlane16:
4211     case Intrinsic::amdgcn_permlanex16: {
4212       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4213       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4214       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4215       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4216       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4217       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4218       break;
4219     }
4220     case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4221     case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4222     case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4223     case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4224     case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4225     case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4226     case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4227     case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4228     case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4229     case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4230     case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4231     case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4232     case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4233     case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4234     case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4235     case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4236     case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4237     case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4238     case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4239     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4240     case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4241     case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4242     case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4243     case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4244     case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4245     case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4246     case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4247       // Default for MAI intrinsics.
4248       // srcC can also be an immediate which can be folded later.
4249       // FIXME: Should we eventually add an alternative mapping with AGPR src
4250       // for srcA/srcB?
4251       //
4252       // vdst, srcA, srcB, srcC
4253       const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4254       OpdsMapping[0] =
4255           Info->mayNeedAGPRs()
4256               ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4257               : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4258       OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4259       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4260       OpdsMapping[4] =
4261           Info->mayNeedAGPRs()
4262               ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4263               : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4264       break;
4265     }
4266     case Intrinsic::amdgcn_interp_p1:
4267     case Intrinsic::amdgcn_interp_p2:
4268     case Intrinsic::amdgcn_interp_mov:
4269     case Intrinsic::amdgcn_interp_p1_f16:
4270     case Intrinsic::amdgcn_interp_p2_f16: {
4271       const int M0Idx = MI.getNumOperands() - 1;
4272       Register M0Reg = MI.getOperand(M0Idx).getReg();
4273       unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4274       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4275 
4276       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4277       for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4278         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4279 
4280       // Must be SGPR, but we must take whatever the original bank is and fix it
4281       // later.
4282       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4283       break;
4284     }
4285     case Intrinsic::amdgcn_ballot: {
4286       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4287       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4288       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4289       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4290       break;
4291     }
4292     }
4293     break;
4294   }
4295   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4296   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4297   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4298   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4299     auto IntrID = MI.getIntrinsicID();
4300     const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4301     assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4302     // Non-images can have complications from operands that allow both SGPR
4303     // and VGPR. For now it's too complicated to figure out the final opcode
4304     // to derive the register bank from the MCInstrDesc.
4305     assert(RSrcIntrin->IsImage);
4306     return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4307   }
4308   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4309     unsigned N = MI.getNumExplicitOperands() - 2;
4310     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4311     OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4312     if (N == 3) {
4313       // Sequential form: all operands combined into VGPR256/VGPR512
4314       unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4315       if (Size > 256)
4316         Size = 512;
4317       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4318     } else {
4319       // NSA form
4320       for (unsigned I = 2; I < N; ++I)
4321         OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4322     }
4323     break;
4324   }
4325   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4326     auto IntrID = MI.getIntrinsicID();
4327     switch (IntrID) {
4328     case Intrinsic::amdgcn_s_getreg:
4329     case Intrinsic::amdgcn_s_memtime:
4330     case Intrinsic::amdgcn_s_memrealtime:
4331     case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4332       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4333       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4334       break;
4335     }
4336     case Intrinsic::amdgcn_global_atomic_fadd:
4337     case Intrinsic::amdgcn_global_atomic_csub:
4338     case Intrinsic::amdgcn_global_atomic_fmin:
4339     case Intrinsic::amdgcn_global_atomic_fmax:
4340     case Intrinsic::amdgcn_flat_atomic_fadd:
4341     case Intrinsic::amdgcn_flat_atomic_fmin:
4342     case Intrinsic::amdgcn_flat_atomic_fmax:
4343     case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4344     case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4345       return getDefaultMappingAllVGPR(MI);
4346     case Intrinsic::amdgcn_ds_ordered_add:
4347     case Intrinsic::amdgcn_ds_ordered_swap: {
4348       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4349       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4350       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4351                                  AMDGPU::SGPRRegBankID);
4352       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4353       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4354       break;
4355     }
4356     case Intrinsic::amdgcn_ds_append:
4357     case Intrinsic::amdgcn_ds_consume: {
4358       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4359       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4360       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4361       break;
4362     }
4363     case Intrinsic::amdgcn_exp_compr:
4364       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4365       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4366       break;
4367     case Intrinsic::amdgcn_exp:
4368       // FIXME: Could we support packed types here?
4369       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4370       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4371       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4372       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4373       break;
4374     case Intrinsic::amdgcn_s_sendmsg:
4375     case Intrinsic::amdgcn_s_sendmsghalt: {
4376       // This must be an SGPR, but accept a VGPR.
4377       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4378                                    AMDGPU::SGPRRegBankID);
4379       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4380       break;
4381     }
4382     case Intrinsic::amdgcn_s_setreg: {
4383       // This must be an SGPR, but accept a VGPR.
4384       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4385                                    AMDGPU::SGPRRegBankID);
4386       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4387       break;
4388     }
4389     case Intrinsic::amdgcn_end_cf: {
4390       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4391       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4392       break;
4393     }
4394     case Intrinsic::amdgcn_else: {
4395       unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4396       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4397       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4398       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4399       break;
4400     }
4401     case Intrinsic::amdgcn_live_mask: {
4402       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4403       break;
4404     }
4405     case Intrinsic::amdgcn_wqm_demote:
4406     case Intrinsic::amdgcn_kill: {
4407       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4408       break;
4409     }
4410     case Intrinsic::amdgcn_raw_buffer_load:
4411     case Intrinsic::amdgcn_raw_tbuffer_load: {
4412       // FIXME: Should make intrinsic ID the last operand of the instruction,
4413       // then this would be the same as store
4414       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4415       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4416       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4417       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4418       break;
4419     }
4420     case Intrinsic::amdgcn_raw_buffer_store:
4421     case Intrinsic::amdgcn_raw_buffer_store_format:
4422     case Intrinsic::amdgcn_raw_tbuffer_store: {
4423       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4424       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4425       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4426       OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4427       break;
4428     }
4429     case Intrinsic::amdgcn_struct_buffer_load:
4430     case Intrinsic::amdgcn_struct_tbuffer_load: {
4431       OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4432       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4433       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4434       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4435       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4436       break;
4437     }
4438     case Intrinsic::amdgcn_struct_buffer_store:
4439     case Intrinsic::amdgcn_struct_tbuffer_store: {
4440       OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4441       OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4442       OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4443       OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4444       OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4445       break;
4446     }
4447     case Intrinsic::amdgcn_init_exec_from_input: {
4448       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4449       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4450       break;
4451     }
4452     case Intrinsic::amdgcn_ds_gws_init:
4453     case Intrinsic::amdgcn_ds_gws_barrier:
4454     case Intrinsic::amdgcn_ds_gws_sema_br: {
4455       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4456 
4457       // This must be an SGPR, but accept a VGPR.
4458       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4459                                    AMDGPU::SGPRRegBankID);
4460       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4461       break;
4462     }
4463     case Intrinsic::amdgcn_ds_gws_sema_v:
4464     case Intrinsic::amdgcn_ds_gws_sema_p:
4465     case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4466       // This must be an SGPR, but accept a VGPR.
4467       unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4468                                    AMDGPU::SGPRRegBankID);
4469       OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4470       break;
4471     }
4472     default:
4473       return getInvalidInstructionMapping();
4474     }
4475     break;
4476   }
4477   case AMDGPU::G_SELECT: {
4478     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4479     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4480                                     AMDGPU::SGPRRegBankID);
4481     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4482                                     AMDGPU::SGPRRegBankID);
4483     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4484                     Op3Bank == AMDGPU::SGPRRegBankID;
4485 
4486     unsigned CondBankDefault = SGPRSrcs ?
4487       AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4488     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4489                                      CondBankDefault);
4490     if (CondBank == AMDGPU::SGPRRegBankID)
4491       CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4492     else if (CondBank == AMDGPU::VGPRRegBankID)
4493       CondBank = AMDGPU::VCCRegBankID;
4494 
4495     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4496       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4497 
4498     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4499 
4500     // TODO: Should report 32-bit for scalar condition type.
4501     if (Size == 64) {
4502       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4503       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4504       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4505       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4506     } else {
4507       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4508       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4509       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4510       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4511     }
4512 
4513     break;
4514   }
4515 
4516   case AMDGPU::G_SI_CALL: {
4517     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4518     // Lie and claim everything is legal, even though some need to be
4519     // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4520     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4521 
4522     // Allow anything for implicit arguments
4523     for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4524       if (MI.getOperand(I).isReg()) {
4525         Register Reg = MI.getOperand(I).getReg();
4526         auto OpBank = getRegBankID(Reg, MRI);
4527         unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4528         OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4529       }
4530     }
4531     break;
4532   }
4533   case AMDGPU::G_LOAD:
4534   case AMDGPU::G_ZEXTLOAD:
4535   case AMDGPU::G_SEXTLOAD:
4536     return getInstrMappingForLoad(MI);
4537 
4538   case AMDGPU::G_ATOMICRMW_XCHG:
4539   case AMDGPU::G_ATOMICRMW_ADD:
4540   case AMDGPU::G_ATOMICRMW_SUB:
4541   case AMDGPU::G_ATOMICRMW_AND:
4542   case AMDGPU::G_ATOMICRMW_OR:
4543   case AMDGPU::G_ATOMICRMW_XOR:
4544   case AMDGPU::G_ATOMICRMW_MAX:
4545   case AMDGPU::G_ATOMICRMW_MIN:
4546   case AMDGPU::G_ATOMICRMW_UMAX:
4547   case AMDGPU::G_ATOMICRMW_UMIN:
4548   case AMDGPU::G_ATOMICRMW_FADD:
4549   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4550   case AMDGPU::G_AMDGPU_ATOMIC_INC:
4551   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4552   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4553   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4554     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4555     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4556     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4557     break;
4558   }
4559   case AMDGPU::G_ATOMIC_CMPXCHG: {
4560     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4561     OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4562     OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4563     OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4564     break;
4565   }
4566   case AMDGPU::G_BRCOND: {
4567     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4568                                  AMDGPU::SGPRRegBankID);
4569     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4570     if (Bank != AMDGPU::SGPRRegBankID)
4571       Bank = AMDGPU::VCCRegBankID;
4572 
4573     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4574     break;
4575   }
4576   case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
4577   case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
4578     return getDefaultMappingVOP(MI);
4579   }
4580 
4581   return getInstructionMapping(/*ID*/1, /*Cost*/1,
4582                                getOperandsMapping(OpdsMapping),
4583                                MI.getNumOperands());
4584 }
4585