1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPURegisterBankInfo.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
24 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
25 #include "llvm/CodeGen/TargetRegisterInfo.h"
26 #include "llvm/CodeGen/TargetSubtargetInfo.h"
27 #include "llvm/IR/Constants.h"
28 
29 #define GET_TARGET_REGBANK_IMPL
30 #include "AMDGPUGenRegisterBank.inc"
31 
32 // This file will be TableGen'ed at some point.
33 #include "AMDGPUGenRegisterBankInfo.def"
34 
35 using namespace llvm;
36 
37 namespace {
38 
39 // Observer to apply a register bank to new registers created by LegalizerHelper.
40 class ApplyRegBankMapping final : public GISelChangeObserver {
41 private:
42   MachineRegisterInfo &MRI;
43   const RegisterBank *NewBank;
44   SmallVector<MachineInstr *, 4> NewInsts;
45 
46 public:
47   ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
48     : MRI(MRI_), NewBank(RB) {}
49 
50   ~ApplyRegBankMapping() {
51     for (MachineInstr *MI : NewInsts)
52       applyBank(*MI);
53   }
54 
55   /// Set any registers that don't have a set register class or bank to SALU.
56   void applyBank(MachineInstr &MI) {
57     for (MachineOperand &Op : MI.operands()) {
58       if (!Op.isReg())
59         continue;
60 
61       Register Reg = Op.getReg();
62       if (MRI.getRegClassOrRegBank(Reg))
63         continue;
64 
65       const RegisterBank *RB = NewBank;
66       // FIXME: This might not be enough to detect when SCC should be used.
67       if (MRI.getType(Reg) == LLT::scalar(1))
68         RB = (NewBank == &AMDGPU::SGPRRegBank ?
69               &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
70 
71       MRI.setRegBank(Reg, *RB);
72     }
73   }
74 
75   void erasingInstr(MachineInstr &MI) override {}
76 
77   void createdInstr(MachineInstr &MI) override {
78     // At this point, the instruction was just inserted and has no operands.
79     NewInsts.push_back(&MI);
80   }
81 
82   void changingInstr(MachineInstr &MI) override {}
83   void changedInstr(MachineInstr &MI) override {}
84 };
85 
86 }
87 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
88     : AMDGPUGenRegisterBankInfo(),
89       TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
90 
91   // HACK: Until this is fully tablegen'd.
92   static bool AlreadyInit = false;
93   if (AlreadyInit)
94     return;
95 
96   AlreadyInit = true;
97 
98   const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID);
99   (void)RBSGPR;
100   assert(&RBSGPR == &AMDGPU::SGPRRegBank);
101 
102   const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID);
103   (void)RBVGPR;
104   assert(&RBVGPR == &AMDGPU::VGPRRegBank);
105 
106 }
107 
108 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
109                                           const RegisterBank &Src,
110                                           unsigned Size) const {
111   // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
112   if (Dst.getID() == AMDGPU::SGPRRegBankID &&
113       Src.getID() == AMDGPU::VGPRRegBankID) {
114     return std::numeric_limits<unsigned>::max();
115   }
116 
117   // Bool values are tricky, because the meaning is based on context. The SCC
118   // and VCC banks are for the natural scalar and vector conditions produced by
119   // a compare.
120   //
121   // Legalization doesn't know about the necessary context, so an s1 use may
122   // have been a truncate from an arbitrary value, in which case a copy (lowered
123   // as a compare with 0) needs to be inserted.
124   if (Size == 1 &&
125       (Dst.getID() == AMDGPU::SCCRegBankID ||
126        Dst.getID() == AMDGPU::SGPRRegBankID) &&
127       (Src.getID() == AMDGPU::SGPRRegBankID ||
128        Src.getID() == AMDGPU::VGPRRegBankID ||
129        Src.getID() == AMDGPU::VCCRegBankID))
130     return std::numeric_limits<unsigned>::max();
131 
132   if (Dst.getID() == AMDGPU::SCCRegBankID &&
133       Src.getID() == AMDGPU::VCCRegBankID)
134     return std::numeric_limits<unsigned>::max();
135 
136   return RegisterBankInfo::copyCost(Dst, Src, Size);
137 }
138 
139 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
140   const ValueMapping &ValMapping,
141   const RegisterBank *CurBank) const {
142   // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
143   // VGPR.
144   // FIXME: Is there a better way to do this?
145   if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
146     return 10; // This is expensive.
147 
148   assert(ValMapping.NumBreakDowns == 2 &&
149          ValMapping.BreakDown[0].Length == 32 &&
150          ValMapping.BreakDown[0].StartIdx == 0 &&
151          ValMapping.BreakDown[1].Length == 32 &&
152          ValMapping.BreakDown[1].StartIdx == 32 &&
153          ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
154 
155   // 32-bit extract of a 64-bit value is just access of a subregister, so free.
156   // TODO: Cost of 0 hits assert, though it's not clear it's what we really
157   // want.
158 
159   // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
160   // alignment restrictions, but this probably isn't important.
161   return 1;
162 }
163 
164 const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
165     const TargetRegisterClass &RC) const {
166 
167   if (TRI->isSGPRClass(&RC))
168     return getRegBank(AMDGPU::SGPRRegBankID);
169 
170   return getRegBank(AMDGPU::VGPRRegBankID);
171 }
172 
173 template <unsigned NumOps>
174 RegisterBankInfo::InstructionMappings
175 AMDGPURegisterBankInfo::addMappingFromTable(
176     const MachineInstr &MI, const MachineRegisterInfo &MRI,
177     const std::array<unsigned, NumOps> RegSrcOpIdx,
178     ArrayRef<OpRegBankEntry<NumOps>> Table) const {
179 
180   InstructionMappings AltMappings;
181 
182   SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
183 
184   unsigned Sizes[NumOps];
185   for (unsigned I = 0; I < NumOps; ++I) {
186     Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
187     Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
188   }
189 
190   for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
191     unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
192     Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
193   }
194 
195   // getInstrMapping's default mapping uses ID 1, so start at 2.
196   unsigned MappingID = 2;
197   for (const auto &Entry : Table) {
198     for (unsigned I = 0; I < NumOps; ++I) {
199       int OpIdx = RegSrcOpIdx[I];
200       Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
201     }
202 
203     AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
204                                                  getOperandsMapping(Operands),
205                                                  Operands.size()));
206   }
207 
208   return AltMappings;
209 }
210 
211 RegisterBankInfo::InstructionMappings
212 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
213     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
214   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
215   case Intrinsic::amdgcn_readlane: {
216     static const OpRegBankEntry<3> Table[2] = {
217       // Perfectly legal.
218       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
219 
220       // Need a readfirstlane for the index.
221       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
222     };
223 
224     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
225     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
226   }
227   case Intrinsic::amdgcn_writelane: {
228     static const OpRegBankEntry<4> Table[4] = {
229       // Perfectly legal.
230       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
231 
232       // Need readfirstlane of first op
233       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
234 
235       // Need readfirstlane of second op
236       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
237 
238       // Need readfirstlane of both ops
239       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
240     };
241 
242     // rsrc, voffset, offset
243     const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
244     return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
245   }
246   default:
247     return RegisterBankInfo::getInstrAlternativeMappings(MI);
248   }
249 }
250 
251 RegisterBankInfo::InstructionMappings
252 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
253     const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
254 
255   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
256   case Intrinsic::amdgcn_buffer_load: {
257     static const OpRegBankEntry<3> Table[4] = {
258       // Perfectly legal.
259       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
260       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
261 
262       // Waterfall loop needed for rsrc. In the worst case this will execute
263       // approximately an extra 10 * wavesize + 2 instructions.
264       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
265       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
266     };
267 
268     // rsrc, voffset, offset
269     const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
270     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
271   }
272   case Intrinsic::amdgcn_s_buffer_load: {
273     static const OpRegBankEntry<2> Table[4] = {
274       // Perfectly legal.
275       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
276 
277       // Only need 1 register in loop
278       { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
279 
280       // Have to waterfall the resource.
281       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
282 
283       // Have to waterfall the resource, and the offset.
284       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
285     };
286 
287     // rsrc, offset
288     const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
289     return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
290   }
291   case Intrinsic::amdgcn_ds_ordered_add:
292   case Intrinsic::amdgcn_ds_ordered_swap: {
293     // VGPR = M0, VGPR
294     static const OpRegBankEntry<3> Table[2] = {
295       // Perfectly legal.
296       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID  }, 1 },
297 
298       // Need a readfirstlane for m0
299       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
300     };
301 
302     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
303     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
304   }
305   case Intrinsic::amdgcn_s_sendmsg:
306   case Intrinsic::amdgcn_s_sendmsghalt: {
307     static const OpRegBankEntry<1> Table[2] = {
308       // Perfectly legal.
309       { { AMDGPU::SGPRRegBankID }, 1 },
310 
311       // Need readlane
312       { { AMDGPU::VGPRRegBankID }, 3 }
313     };
314 
315     const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
316     return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
317   }
318   default:
319     return RegisterBankInfo::getInstrAlternativeMappings(MI);
320   }
321 }
322 
323 static bool isInstrUniform(const MachineInstr &MI) {
324   if (!MI.hasOneMemOperand())
325     return false;
326 
327   const MachineMemOperand *MMO = *MI.memoperands_begin();
328   return AMDGPUInstrInfo::isUniformMMO(MMO);
329 }
330 
331 RegisterBankInfo::InstructionMappings
332 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
333     const MachineInstr &MI) const {
334 
335   const MachineFunction &MF = *MI.getParent()->getParent();
336   const MachineRegisterInfo &MRI = MF.getRegInfo();
337 
338 
339   InstructionMappings AltMappings;
340   switch (MI.getOpcode()) {
341   case TargetOpcode::G_CONSTANT:
342   case TargetOpcode::G_FCONSTANT:
343   case TargetOpcode::G_FRAME_INDEX:
344   case TargetOpcode::G_GLOBAL_VALUE: {
345     static const OpRegBankEntry<1> Table[2] = {
346       { { AMDGPU::VGPRRegBankID }, 1 },
347       { { AMDGPU::SGPRRegBankID }, 1 }
348     };
349 
350     return addMappingFromTable<1>(MI, MRI, { 0 }, Table);
351   }
352   case TargetOpcode::G_AND:
353   case TargetOpcode::G_OR:
354   case TargetOpcode::G_XOR: {
355     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
356 
357     if (Size == 1) {
358       // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
359       const InstructionMapping &SCCMapping = getInstructionMapping(
360         1, 1, getOperandsMapping(
361           {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
362            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
363            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
364         3); // Num Operands
365       AltMappings.push_back(&SCCMapping);
366 
367       const InstructionMapping &SGPRMapping = getInstructionMapping(
368         1, 1, getOperandsMapping(
369           {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
370            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
371            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
372         3); // Num Operands
373       AltMappings.push_back(&SGPRMapping);
374 
375       const InstructionMapping &VCCMapping0 = getInstructionMapping(
376         2, 10, getOperandsMapping(
377           {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
378               AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
379               AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
380         3); // Num Operands
381       AltMappings.push_back(&VCCMapping0);
382       return AltMappings;
383     }
384 
385     if (Size != 64)
386       break;
387 
388     const InstructionMapping &SSMapping = getInstructionMapping(
389       1, 1, getOperandsMapping(
390         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
391          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
392          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
393       3); // Num Operands
394     AltMappings.push_back(&SSMapping);
395 
396     const InstructionMapping &VVMapping = getInstructionMapping(
397       2, 2, getOperandsMapping(
398         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
399          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
400          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
401       3); // Num Operands
402     AltMappings.push_back(&VVMapping);
403 
404     const InstructionMapping &SVMapping = getInstructionMapping(
405       3, 3, getOperandsMapping(
406         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
407          AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
408          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
409       3); // Num Operands
410     AltMappings.push_back(&SVMapping);
411 
412     // SGPR in LHS is slightly preferrable, so make it VS more expensive than
413     // SV.
414     const InstructionMapping &VSMapping = getInstructionMapping(
415       3, 4, getOperandsMapping(
416         {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
417          AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
418          AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
419       3); // Num Operands
420     AltMappings.push_back(&VSMapping);
421     break;
422   }
423   case TargetOpcode::G_LOAD: {
424     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
425     LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
426     // FIXME: Should we be hard coding the size for these mappings?
427     if (isInstrUniform(MI)) {
428       const InstructionMapping &SSMapping = getInstructionMapping(
429           1, 1, getOperandsMapping(
430                     {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
431                      AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
432           2); // Num Operands
433       AltMappings.push_back(&SSMapping);
434     }
435 
436     const InstructionMapping &VVMapping = getInstructionMapping(
437         2, 1, getOperandsMapping(
438                   {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
439                    AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
440         2); // Num Operands
441     AltMappings.push_back(&VVMapping);
442 
443     // It may be possible to have a vgpr = load sgpr mapping here, because
444     // the mubuf instructions support this kind of load, but probably for only
445     // gfx7 and older.  However, the addressing mode matching in the instruction
446     // selector should be able to do a better job of detecting and selecting
447     // these kinds of loads from the vgpr = load vgpr mapping.
448 
449     return AltMappings;
450 
451   }
452   case TargetOpcode::G_ICMP: {
453     unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
454     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
455       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
456                           nullptr, // Predicate operand.
457                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
458                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
459       4); // Num Operands
460     AltMappings.push_back(&SSMapping);
461 
462     const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
463       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
464                           nullptr, // Predicate operand.
465                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
466                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
467       4); // Num Operands
468     AltMappings.push_back(&SVMapping);
469 
470     const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
471       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
472                           nullptr, // Predicate operand.
473                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
474                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
475       4); // Num Operands
476     AltMappings.push_back(&VSMapping);
477 
478     const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
479       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
480                           nullptr, // Predicate operand.
481                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
482                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
483       4); // Num Operands
484     AltMappings.push_back(&VVMapping);
485 
486     return AltMappings;
487   }
488   case TargetOpcode::G_SELECT: {
489     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
490     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
491       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
492                           AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
493                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
494                           AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
495       4); // Num Operands
496     AltMappings.push_back(&SSMapping);
497 
498     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
499       getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
500                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
501                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
502                           AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
503       4); // Num Operands
504     AltMappings.push_back(&VVMapping);
505 
506     return AltMappings;
507   }
508   case TargetOpcode::G_SMIN:
509   case TargetOpcode::G_SMAX:
510   case TargetOpcode::G_UMIN:
511   case TargetOpcode::G_UMAX: {
512     static const OpRegBankEntry<3> Table[4] = {
513       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
514       { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
515       { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
516 
517       // Scalar requires cmp+select, and extends if 16-bit.
518       // FIXME: Should there be separate costs for 32 and 16-bit
519       { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 3 }
520     };
521 
522     const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 1, 2 } };
523     return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
524   }
525   case TargetOpcode::G_UADDE:
526   case TargetOpcode::G_USUBE:
527   case TargetOpcode::G_SADDE:
528   case TargetOpcode::G_SSUBE: {
529     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
530     const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
531       getOperandsMapping(
532         {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
533          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
534          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
535          AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
536          AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
537       5); // Num Operands
538     AltMappings.push_back(&SSMapping);
539 
540     const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
541       getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
542                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
543                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
544                           AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
545                           AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
546       5); // Num Operands
547     AltMappings.push_back(&VVMapping);
548     return AltMappings;
549   }
550   case AMDGPU::G_BRCOND: {
551     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
552 
553     const InstructionMapping &SMapping = getInstructionMapping(
554       1, 1, getOperandsMapping(
555         {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
556       2); // Num Operands
557     AltMappings.push_back(&SMapping);
558 
559     const InstructionMapping &VMapping = getInstructionMapping(
560       1, 1, getOperandsMapping(
561         {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
562       2); // Num Operands
563     AltMappings.push_back(&VMapping);
564     return AltMappings;
565   }
566   case AMDGPU::G_INTRINSIC:
567     return getInstrAlternativeMappingsIntrinsic(MI, MRI);
568   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
569     return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
570   default:
571     break;
572   }
573   return RegisterBankInfo::getInstrAlternativeMappings(MI);
574 }
575 
576 void AMDGPURegisterBankInfo::split64BitValueForMapping(
577   MachineIRBuilder &B,
578   SmallVector<Register, 2> &Regs,
579   LLT HalfTy,
580   Register Reg) const {
581   assert(HalfTy.getSizeInBits() == 32);
582   MachineRegisterInfo *MRI = B.getMRI();
583   Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
584   Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
585   const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
586   MRI->setRegBank(LoLHS, *Bank);
587   MRI->setRegBank(HiLHS, *Bank);
588 
589   Regs.push_back(LoLHS);
590   Regs.push_back(HiLHS);
591 
592   B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
593     .addDef(LoLHS)
594     .addDef(HiLHS)
595     .addUse(Reg);
596 }
597 
598 /// Replace the current type each register in \p Regs has with \p NewTy
599 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
600                           LLT NewTy) {
601   for (Register Reg : Regs) {
602     assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
603     MRI.setType(Reg, NewTy);
604   }
605 }
606 
607 static LLT getHalfSizedType(LLT Ty) {
608   if (Ty.isVector()) {
609     assert(Ty.getNumElements() % 2 == 0);
610     return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
611   }
612 
613   assert(Ty.getSizeInBits() % 2 == 0);
614   return LLT::scalar(Ty.getSizeInBits() / 2);
615 }
616 
617 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
618 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
619 /// execute the instruction for each unique combination of values in all lanes
620 /// in the wave. The block will be split such that rest of the instructions are
621 /// moved to a new block.
622 ///
623 /// Essentially performs this loop:
///
625 /// Save Execution Mask
626 /// For (Lane : Wavefront) {
627 ///   Enable Lane, Disable all other lanes
628 ///   SGPR = read SGPR value for current lane from VGPR
629 ///   VGPRResult[Lane] = use_op SGPR
630 /// }
631 /// Restore Execution Mask
632 ///
/// There is additional complexity from comparing the value read for the
/// current lane against every lane to identify the unique values used.
635 void AMDGPURegisterBankInfo::executeInWaterfallLoop(
636   MachineInstr &MI, MachineRegisterInfo &MRI,
637   ArrayRef<unsigned> OpIndices) const {
638   MachineFunction *MF = MI.getParent()->getParent();
639   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
640   const SIInstrInfo *TII = ST.getInstrInfo();
641   MachineBasicBlock::iterator I(MI);
642 
643   MachineBasicBlock &MBB = *MI.getParent();
644   const DebugLoc &DL = MI.getDebugLoc();
645 
646   // Use a set to avoid extra readfirstlanes in the case where multiple operands
647   // are the same register.
648   SmallSet<Register, 4> SGPROperandRegs;
649   for (unsigned Op : OpIndices) {
650     assert(MI.getOperand(Op).isUse());
651     Register Reg = MI.getOperand(Op).getReg();
652     const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
653     if (OpBank->getID() == AMDGPU::VGPRRegBankID)
654       SGPROperandRegs.insert(Reg);
655   }
656 
657   // No operands need to be replaced, so no need to loop.
658   if (SGPROperandRegs.empty())
659     return;
660 
661   MachineIRBuilder B(MI);
662   SmallVector<Register, 4> ResultRegs;
663   SmallVector<Register, 4> InitResultRegs;
664   SmallVector<Register, 4> PhiRegs;
665   for (MachineOperand &Def : MI.defs()) {
666     LLT ResTy = MRI.getType(Def.getReg());
667     const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
668     ResultRegs.push_back(Def.getReg());
669     Register InitReg = B.buildUndef(ResTy).getReg(0);
670     Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
671     InitResultRegs.push_back(InitReg);
672     PhiRegs.push_back(PhiReg);
673     MRI.setRegBank(PhiReg, *DefBank);
674     MRI.setRegBank(InitReg, *DefBank);
675   }
676 
677   Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
678   Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
679 
680   // Don't bother using generic instructions/registers for the exec mask.
681   B.buildInstr(TargetOpcode::IMPLICIT_DEF)
682     .addDef(InitSaveExecReg);
683 
684   Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
685   Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
686 
687   // To insert the loop we need to split the block. Move everything before this
688   // point to a new block, and insert a new empty block before this instruction.
689   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
690   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
691   MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
692   MachineFunction::iterator MBBI(MBB);
693   ++MBBI;
694   MF->insert(MBBI, LoopBB);
695   MF->insert(MBBI, RestoreExecBB);
696   MF->insert(MBBI, RemainderBB);
697 
698   LoopBB->addSuccessor(RestoreExecBB);
699   LoopBB->addSuccessor(LoopBB);
700 
701   // Move the rest of the block into a new block.
702   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
703   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
704 
705   MBB.addSuccessor(LoopBB);
706   RestoreExecBB->addSuccessor(RemainderBB);
707 
708   B.setInsertPt(*LoopBB, LoopBB->end());
709 
710   B.buildInstr(TargetOpcode::PHI)
711     .addDef(PhiExec)
712     .addReg(InitSaveExecReg)
713     .addMBB(&MBB)
714     .addReg(NewExec)
715     .addMBB(LoopBB);
716 
717   for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
718     B.buildInstr(TargetOpcode::G_PHI)
719       .addDef(std::get<2>(Result))
720       .addReg(std::get<0>(Result)) // Initial value / implicit_def
721       .addMBB(&MBB)
722       .addReg(std::get<1>(Result)) // Mid-loop value.
723       .addMBB(LoopBB);
724   }
725 
726   // Move the instruction into the loop.
727   LoopBB->splice(LoopBB->end(), &MBB, I);
728   I = std::prev(LoopBB->end());
729 
730   B.setInstr(*I);
731 
732   Register CondReg;
733 
734   for (MachineOperand &Op : MI.uses()) {
735     if (!Op.isReg())
736       continue;
737 
738     assert(!Op.isDef());
739     if (SGPROperandRegs.count(Op.getReg())) {
740       LLT OpTy = MRI.getType(Op.getReg());
741       unsigned OpSize = OpTy.getSizeInBits();
742 
743       // Can only do a readlane of 32-bit pieces.
744       if (OpSize == 32) {
745         // Avoid extra copies in the simple case of one 32-bit register.
746         Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
747         MRI.setType(CurrentLaneOpReg, OpTy);
748 
749         constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
750         // Read the next variant <- also loop target.
751         BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
752           .addReg(Op.getReg());
753 
754         Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
755         bool First = CondReg == AMDGPU::NoRegister;
756         if (First)
757           CondReg = NewCondReg;
758 
759         // Compare the just read M0 value to all possible Idx values.
760         B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
761           .addDef(NewCondReg)
762           .addReg(CurrentLaneOpReg)
763           .addReg(Op.getReg());
764         Op.setReg(CurrentLaneOpReg);
765 
766         if (!First) {
767           Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
768 
769           // If there are multiple operands to consider, and the conditions.
770           B.buildInstr(AMDGPU::S_AND_B64)
771             .addDef(AndReg)
772             .addReg(NewCondReg)
773             .addReg(CondReg);
774           CondReg = AndReg;
775         }
776       } else {
777         LLT S32 = LLT::scalar(32);
778         SmallVector<Register, 8> ReadlanePieces;
779 
780         // The compares can be done as 64-bit, but the extract needs to be done
781         // in 32-bit pieces.
782 
783         bool Is64 = OpSize % 64 == 0;
784 
785         LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
786         unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
787                                           : AMDGPU::V_CMP_EQ_U32_e64;
788 
789         // The compares can be done as 64-bit, but the extract needs to be done
790         // in 32-bit pieces.
791 
792         // Insert the unmerge before the loop.
793 
794         B.setMBB(MBB);
795         auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
796         B.setInstr(*I);
797 
798         unsigned NumPieces = Unmerge->getNumOperands() - 1;
799         for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
800           Register UnmergePiece = Unmerge.getReg(PieceIdx);
801 
802           Register CurrentLaneOpReg;
803           if (Is64) {
804             Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
805             Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
806 
807             MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
808             MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
809             MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
810 
811             // Read the next variant <- also loop target.
812             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
813                     CurrentLaneOpRegLo)
814               .addReg(UnmergePiece, 0, AMDGPU::sub0);
815 
816             // Read the next variant <- also loop target.
817             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
818                     CurrentLaneOpRegHi)
819               .addReg(UnmergePiece, 0, AMDGPU::sub1);
820 
821             CurrentLaneOpReg =
822                 B.buildMerge(LLT::scalar(64),
823                              {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
824                     .getReg(0);
825 
826             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
827 
828             if (OpTy.getScalarSizeInBits() == 64) {
829               // If we need to produce a 64-bit element vector, so use the
830               // merged pieces
831               ReadlanePieces.push_back(CurrentLaneOpReg);
832             } else {
833               // 32-bit element type.
834               ReadlanePieces.push_back(CurrentLaneOpRegLo);
835               ReadlanePieces.push_back(CurrentLaneOpRegHi);
836             }
837           } else {
838             CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
839             MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
840             MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
841 
842             // Read the next variant <- also loop target.
843             BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
844                     CurrentLaneOpReg)
845               .addReg(UnmergePiece);
846             ReadlanePieces.push_back(CurrentLaneOpReg);
847           }
848 
849           Register NewCondReg
850             = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
851           bool First = CondReg == AMDGPU::NoRegister;
852           if (First)
853             CondReg = NewCondReg;
854 
855           B.buildInstr(CmpOp)
856             .addDef(NewCondReg)
857             .addReg(CurrentLaneOpReg)
858             .addReg(UnmergePiece);
859 
860           if (!First) {
861             Register AndReg
862               = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
863 
864             // If there are multiple operands to consider, and the conditions.
865             B.buildInstr(AMDGPU::S_AND_B64)
866               .addDef(AndReg)
867               .addReg(NewCondReg)
868               .addReg(CondReg);
869             CondReg = AndReg;
870           }
871         }
872 
873         // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
874         // BUILD_VECTOR
875         if (OpTy.isVector()) {
876           auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
877           Op.setReg(Merge.getReg(0));
878         } else {
879           auto Merge = B.buildMerge(OpTy, ReadlanePieces);
880           Op.setReg(Merge.getReg(0));
881         }
882 
883         MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
884       }
885     }
886   }
887 
888   B.setInsertPt(*LoopBB, LoopBB->end());
889 
890   // Update EXEC, save the original EXEC value to VCC.
891   B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
892     .addDef(NewExec)
893     .addReg(CondReg, RegState::Kill);
894 
895   MRI.setSimpleHint(NewExec, CondReg);
896 
897   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
898   B.buildInstr(AMDGPU::S_XOR_B64_term)
899     .addDef(AMDGPU::EXEC)
900     .addReg(AMDGPU::EXEC)
901     .addReg(NewExec);
902 
903   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
904   // s_cbranch_scc0?
905 
906   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
907   B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
908     .addMBB(LoopBB);
909 
910   // Save the EXEC mask before the loop.
911   BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
912     .addReg(AMDGPU::EXEC);
913 
914   // Restore the EXEC mask after the loop.
915   B.setMBB(*RestoreExecBB);
916   B.buildInstr(AMDGPU::S_MOV_B64_term)
917     .addDef(AMDGPU::EXEC)
918     .addReg(SaveExecReg);
919 }
920 
921 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
922 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
923     MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
924   Register Reg = MI.getOperand(OpIdx).getReg();
925   const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
926   if (Bank != &AMDGPU::VGPRRegBank)
927     return;
928 
929   MachineIRBuilder B(MI);
930   Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
931   B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
932     .addDef(SGPR)
933     .addReg(Reg);
934 
935   const TargetRegisterClass *Constrained =
936       constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
937   (void)Constrained;
938   assert(Constrained && "Failed to constrain readfirstlane src reg");
939 
940   MI.getOperand(OpIdx).setReg(SGPR);
941 }
942 
943 // When regbankselect repairs registers, it will insert a repair instruction
944 // which defines the repaired register.  Then it calls applyMapping and expects
945 // that the targets will either delete or rewrite the originally wrote to the
946 // repaired registers.  Beccause of this, we end up in a situation where
947 // we have 2 instructions defining the same registers.
948 static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
949                                      Register Reg,
950                                      const MachineInstr &MI) {
951   // Is there some way we can assert that there are exactly 2 def instructions?
952   for (MachineInstr &Other : MRI.def_instructions(Reg)) {
953     if (&Other != &MI)
954       return &Other;
955   }
956 
957   return nullptr;
958 }
959 
960 bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
961                         const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
962                                               MachineRegisterInfo &MRI) const {
963   Register DstReg = MI.getOperand(0).getReg();
964   const LLT LoadTy =  MRI.getType(DstReg);
965   unsigned LoadSize = LoadTy.getSizeInBits();
966   const unsigned MaxNonSmrdLoadSize = 128;
967   // 128-bit loads are supported for all instruction types.
968   if (LoadSize <= MaxNonSmrdLoadSize)
969     return false;
970 
971   SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
972   SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
973 
974   // If the pointer is an SGPR, we have nothing to do.
975   if (SrcRegs.empty())
976     return false;
977 
978   assert(LoadSize % MaxNonSmrdLoadSize == 0);
979 
980   // We want to get the repair instruction now, because it will help us
981   // determine which instruction the legalizer inserts that will also
982   // write to DstReg.
983   MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
984 
985   // RegBankSelect only emits scalar types, so we need to reset the pointer
986   // operand to a pointer type.
987   Register BasePtrReg = SrcRegs[0];
988   LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
989   MRI.setType(BasePtrReg, PtrTy);
990 
991   MachineIRBuilder B(MI);
992 
993   unsigned SplitElts =
994       MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
995   const LLT LoadSplitTy =  LLT::vector(SplitElts, LoadTy.getScalarType());
996   ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
997   GISelObserverWrapper Observer(&O);
998   B.setChangeObserver(Observer);
999   LegalizerHelper Helper(B.getMF(), Observer, B);
1000   if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1001     return false;
1002 
1003   // At this point, the legalizer has split the original load into smaller
1004   // loads.  At the end of lowering, it inserts an instruction (LegalizedInst)
1005   // that combines the outputs of the lower loads and writes it to DstReg.
1006   // The register bank selector has also added the RepairInst which writes to
1007   // DstReg as well.
1008 
1009   MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
1010 
1011   // Replace the output of the LegalizedInst with a temporary register, since
1012   // RepairInst already defines DstReg.
1013   Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
1014   LegalizedInst->getOperand(0).setReg(TmpReg);
1015   B.setInsertPt(*RepairInst->getParent(), RepairInst);
1016 
1017   for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
1018     Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
1019     B.buildConstant(IdxReg, DefIdx);
1020     MRI.setRegBank(IdxReg, getRegBank(AMDGPU::VGPRRegBankID));
1021     B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
1022   }
1023 
1024   MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1025   return true;
1026 }
1027 
1028 // For cases where only a single copy is inserted for matching register banks.
1029 // Replace the register in the instruction operand
1030 static void substituteSimpleCopyRegs(
1031   const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1032   SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1033   if (!SrcReg.empty()) {
1034     assert(SrcReg.size() == 1);
1035     OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1036   }
1037 }
1038 
1039 void AMDGPURegisterBankInfo::applyMappingImpl(
1040     const OperandsMapper &OpdMapper) const {
1041   MachineInstr &MI = OpdMapper.getMI();
1042   unsigned Opc = MI.getOpcode();
1043   MachineRegisterInfo &MRI = OpdMapper.getMRI();
1044   switch (Opc) {
1045   case AMDGPU::G_SELECT: {
1046     Register DstReg = MI.getOperand(0).getReg();
1047     LLT DstTy = MRI.getType(DstReg);
1048     if (DstTy.getSizeInBits() != 64)
1049       break;
1050 
1051     LLT HalfTy = getHalfSizedType(DstTy);
1052 
1053     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1054     SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
1055     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1056     SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
1057 
1058     // All inputs are SGPRs, nothing special to do.
1059     if (DefRegs.empty()) {
1060       assert(Src1Regs.empty() && Src2Regs.empty());
1061       break;
1062     }
1063 
1064     MachineIRBuilder B(MI);
1065     if (Src0Regs.empty())
1066       Src0Regs.push_back(MI.getOperand(1).getReg());
1067     else {
1068       assert(Src0Regs.size() == 1);
1069     }
1070 
1071     if (Src1Regs.empty())
1072       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1073     else {
1074       setRegsToType(MRI, Src1Regs, HalfTy);
1075     }
1076 
1077     if (Src2Regs.empty())
1078       split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
1079     else
1080       setRegsToType(MRI, Src2Regs, HalfTy);
1081 
1082     setRegsToType(MRI, DefRegs, HalfTy);
1083 
1084     B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
1085     B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
1086 
1087     MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1088     MI.eraseFromParent();
1089     return;
1090   }
1091   case AMDGPU::G_AND:
1092   case AMDGPU::G_OR:
1093   case AMDGPU::G_XOR: {
1094     // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
1095     // there is a VGPR input.
1096     Register DstReg = MI.getOperand(0).getReg();
1097     LLT DstTy = MRI.getType(DstReg);
1098     if (DstTy.getSizeInBits() != 64)
1099       break;
1100 
1101     LLT HalfTy = getHalfSizedType(DstTy);
1102     SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1103     SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
1104     SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
1105 
1106     // All inputs are SGPRs, nothing special to do.
1107     if (DefRegs.empty()) {
1108       assert(Src0Regs.empty() && Src1Regs.empty());
1109       break;
1110     }
1111 
1112     assert(DefRegs.size() == 2);
1113     assert(Src0Regs.size() == Src1Regs.size() &&
1114            (Src0Regs.empty() || Src0Regs.size() == 2));
1115 
1116     // Depending on where the source registers came from, the generic code may
1117     // have decided to split the inputs already or not. If not, we still need to
1118     // extract the values.
1119     MachineIRBuilder B(MI);
1120 
1121     if (Src0Regs.empty())
1122       split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
1123     else
1124       setRegsToType(MRI, Src0Regs, HalfTy);
1125 
1126     if (Src1Regs.empty())
1127       split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
1128     else
1129       setRegsToType(MRI, Src1Regs, HalfTy);
1130 
1131     setRegsToType(MRI, DefRegs, HalfTy);
1132 
1133     B.buildInstr(Opc)
1134       .addDef(DefRegs[0])
1135       .addUse(Src0Regs[0])
1136       .addUse(Src1Regs[0]);
1137 
1138     B.buildInstr(Opc)
1139       .addDef(DefRegs[1])
1140       .addUse(Src0Regs[1])
1141       .addUse(Src1Regs[1]);
1142 
1143     MRI.setRegBank(DstReg, getRegBank(AMDGPU::VGPRRegBankID));
1144     MI.eraseFromParent();
1145     return;
1146   }
1147   case AMDGPU::G_ADD:
1148   case AMDGPU::G_SUB:
1149   case AMDGPU::G_MUL: {
1150     Register DstReg = MI.getOperand(0).getReg();
1151     LLT DstTy = MRI.getType(DstReg);
1152     if (DstTy != LLT::scalar(16))
1153       break;
1154 
1155     const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1156     if (DstBank == &AMDGPU::VGPRRegBank)
1157       break;
1158 
1159     // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
1160     MachineFunction *MF = MI.getParent()->getParent();
1161     MachineIRBuilder B(MI);
1162     ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1163     GISelObserverWrapper Observer(&ApplySALU);
1164     LegalizerHelper Helper(*MF, Observer, B);
1165 
1166     if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
1167         LegalizerHelper::Legalized)
1168       llvm_unreachable("widen scalar should have succeeded");
1169     return;
1170   }
1171   case AMDGPU::G_SMIN:
1172   case AMDGPU::G_SMAX:
1173   case AMDGPU::G_UMIN:
1174   case AMDGPU::G_UMAX: {
1175     Register DstReg = MI.getOperand(0).getReg();
1176     const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
1177     if (DstBank == &AMDGPU::VGPRRegBank)
1178       break;
1179 
1180     MachineFunction *MF = MI.getParent()->getParent();
1181     MachineIRBuilder B(MI);
1182     ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
1183     GISelObserverWrapper Observer(&ApplySALU);
1184     LegalizerHelper Helper(*MF, Observer, B);
1185 
1186     // Turn scalar min/max into a compare and select.
1187     LLT Ty = MRI.getType(DstReg);
1188     LLT S32 = LLT::scalar(32);
1189     LLT S16 = LLT::scalar(16);
1190 
1191     if (Ty == S16) {
1192       // Need to widen to s32, and expand as cmp + select.
1193       if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
1194         llvm_unreachable("widenScalar should have succeeded");
1195 
1196       // FIXME: This is relying on widenScalar leaving MI in place.
1197       if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
1198         llvm_unreachable("lower should have succeeded");
1199     } else {
1200       if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
1201         llvm_unreachable("lower should have succeeded");
1202     }
1203 
1204     return;
1205   }
1206   case AMDGPU::G_SEXT:
1207   case AMDGPU::G_ZEXT: {
1208     Register SrcReg = MI.getOperand(1).getReg();
1209     LLT SrcTy = MRI.getType(SrcReg);
1210     bool Signed = Opc == AMDGPU::G_SEXT;
1211 
1212     MachineIRBuilder B(MI);
1213     const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
1214 
1215     Register DstReg = MI.getOperand(0).getReg();
1216     LLT DstTy = MRI.getType(DstReg);
1217     if (DstTy.isScalar() &&
1218         SrcBank != &AMDGPU::SGPRRegBank &&
1219         SrcBank != &AMDGPU::SCCRegBank &&
1220         SrcBank != &AMDGPU::VCCRegBank &&
1221         // FIXME: Should handle any type that round to s64 when irregular
1222         // breakdowns supported.
1223         DstTy.getSizeInBits() == 64 &&
1224         SrcTy.getSizeInBits() <= 32) {
1225       const LLT S32 = LLT::scalar(32);
1226       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1227 
1228       // Extend to 32-bit, and then extend the low half.
1229       if (Signed) {
1230         // TODO: Should really be buildSExtOrCopy
1231         B.buildSExtOrTrunc(DefRegs[0], SrcReg);
1232 
1233         // Replicate sign bit from 32-bit extended part.
1234         auto ShiftAmt = B.buildConstant(S32, 31);
1235         MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1236         B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
1237       } else {
1238         B.buildZExtOrTrunc(DefRegs[0], SrcReg);
1239         B.buildConstant(DefRegs[1], 0);
1240       }
1241 
1242       MRI.setRegBank(DstReg, *SrcBank);
1243       MI.eraseFromParent();
1244       return;
1245     }
1246 
1247     if (SrcTy != LLT::scalar(1))
1248       return;
1249 
1250     if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
1251       SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
1252 
1253       const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
1254         &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
1255 
1256       unsigned DstSize = DstTy.getSizeInBits();
1257       // 64-bit select is SGPR only
1258       const bool UseSel64 = DstSize > 32 &&
1259         SrcBank->getID() == AMDGPU::SCCRegBankID;
1260 
1261       // TODO: Should s16 select be legal?
1262       LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
1263       auto True = B.buildConstant(SelType, Signed ? -1 : 1);
1264       auto False = B.buildConstant(SelType, 0);
1265 
1266       MRI.setRegBank(True.getReg(0), *DstBank);
1267       MRI.setRegBank(False.getReg(0), *DstBank);
1268       MRI.setRegBank(DstReg, *DstBank);
1269 
1270       if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
1271         B.buildSelect(DefRegs[0], SrcReg, True, False);
1272         B.buildCopy(DefRegs[1], DefRegs[0]);
1273       } else if (DstSize < 32) {
1274         auto Sel = B.buildSelect(SelType, SrcReg, True, False);
1275         MRI.setRegBank(Sel.getReg(0), *DstBank);
1276         B.buildTrunc(DstReg, Sel);
1277       } else {
1278         B.buildSelect(DstReg, SrcReg, True, False);
1279       }
1280 
1281       MI.eraseFromParent();
1282       return;
1283     }
1284 
1285     // Fixup the case with an s1 src that isn't a condition register. Use shifts
1286     // instead of introducing a compare to avoid an unnecessary condition
1287     // register (and since there's no scalar 16-bit compares).
1288     auto Ext = B.buildAnyExt(DstTy, SrcReg);
1289     auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
1290     auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
1291 
1292     if (MI.getOpcode() == AMDGPU::G_SEXT)
1293       B.buildAShr(DstReg, Shl, ShiftAmt);
1294     else
1295       B.buildLShr(DstReg, Shl, ShiftAmt);
1296 
1297     MRI.setRegBank(DstReg, *SrcBank);
1298     MRI.setRegBank(Ext.getReg(0), *SrcBank);
1299     MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
1300     MRI.setRegBank(Shl.getReg(0), *SrcBank);
1301     MI.eraseFromParent();
1302     return;
1303   }
1304   case AMDGPU::G_EXTRACT_VECTOR_ELT:
1305     applyDefaultMapping(OpdMapper);
1306     executeInWaterfallLoop(MI, MRI, { 2 });
1307     return;
1308   case AMDGPU::G_INTRINSIC: {
1309     switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1310     case Intrinsic::amdgcn_s_buffer_load: {
1311       // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
1312       executeInWaterfallLoop(MI, MRI, { 2, 3 });
1313       return;
1314     }
1315     case Intrinsic::amdgcn_readlane: {
1316       substituteSimpleCopyRegs(OpdMapper, 2);
1317 
1318       assert(empty(OpdMapper.getVRegs(0)));
1319       assert(empty(OpdMapper.getVRegs(3)));
1320 
1321       // Make sure the index is an SGPR. It doesn't make sense to run this in a
1322       // waterfall loop, so assume it's a uniform value.
1323       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1324       return;
1325     }
1326     case Intrinsic::amdgcn_writelane: {
1327       assert(empty(OpdMapper.getVRegs(0)));
1328       assert(empty(OpdMapper.getVRegs(2)));
1329       assert(empty(OpdMapper.getVRegs(3)));
1330 
1331       substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
1332       constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
1333       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
1334       return;
1335     }
1336     default:
1337       break;
1338     }
1339     break;
1340   }
1341   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
1342     switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1343     case Intrinsic::amdgcn_buffer_load: {
1344       executeInWaterfallLoop(MI, MRI, { 2 });
1345       return;
1346     }
1347     case Intrinsic::amdgcn_ds_ordered_add:
1348     case Intrinsic::amdgcn_ds_ordered_swap: {
1349       // This is only allowed to execute with 1 lane, so readfirstlane is safe.
1350       assert(empty(OpdMapper.getVRegs(0)));
1351       substituteSimpleCopyRegs(OpdMapper, 3);
1352       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1353       return;
1354     }
1355     case Intrinsic::amdgcn_s_sendmsg:
1356     case Intrinsic::amdgcn_s_sendmsghalt: {
1357       // FIXME: Should this use a waterfall loop?
1358       constrainOpWithReadfirstlane(MI, MRI, 2); // M0
1359       return;
1360     }
1361     default:
1362       break;
1363     }
1364     break;
1365   }
1366   case AMDGPU::G_LOAD: {
1367     if (applyMappingWideLoad(MI, OpdMapper, MRI))
1368       return;
1369     break;
1370   }
1371   default:
1372     break;
1373   }
1374 
1375   return applyDefaultMapping(OpdMapper);
1376 }
1377 
1378 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
1379   const MachineFunction &MF = *MI.getParent()->getParent();
1380   const MachineRegisterInfo &MRI = MF.getRegInfo();
1381   for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
1382     if (!MI.getOperand(i).isReg())
1383       continue;
1384     Register Reg = MI.getOperand(i).getReg();
1385     if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
1386       if (Bank->getID() == AMDGPU::VGPRRegBankID)
1387         return false;
1388 
1389       assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
1390              Bank->getID() == AMDGPU::SCCRegBankID);
1391     }
1392   }
1393   return true;
1394 }
1395 
1396 const RegisterBankInfo::InstructionMapping &
1397 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
1398   const MachineFunction &MF = *MI.getParent()->getParent();
1399   const MachineRegisterInfo &MRI = MF.getRegInfo();
1400   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1401 
1402   for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1403     unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1404     unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
1405     OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
1406   }
1407   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1408                                MI.getNumOperands());
1409 }
1410 
1411 const RegisterBankInfo::InstructionMapping &
1412 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
1413   const MachineFunction &MF = *MI.getParent()->getParent();
1414   const MachineRegisterInfo &MRI = MF.getRegInfo();
1415   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1416   unsigned OpdIdx = 0;
1417 
1418   unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1419   OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
1420 
1421   if (MI.getOperand(OpdIdx).isIntrinsicID())
1422     OpdsMapping[OpdIdx++] = nullptr;
1423 
1424   Register Reg1 = MI.getOperand(OpdIdx).getReg();
1425   unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
1426 
1427   unsigned DefaultBankID = Size1 == 1 ?
1428     AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1429   unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
1430 
1431   OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
1432 
1433   for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
1434     const MachineOperand &MO = MI.getOperand(OpdIdx);
1435     if (!MO.isReg())
1436       continue;
1437 
1438     unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
1439     unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
1440     OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
1441   }
1442 
1443   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1444                                MI.getNumOperands());
1445 }
1446 
1447 const RegisterBankInfo::InstructionMapping &
1448 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
1449   const MachineFunction &MF = *MI.getParent()->getParent();
1450   const MachineRegisterInfo &MRI = MF.getRegInfo();
1451   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1452 
1453   for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1454     const MachineOperand &Op = MI.getOperand(I);
1455     if (!Op.isReg())
1456       continue;
1457 
1458     unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
1459     OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1460   }
1461 
1462   return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
1463                                MI.getNumOperands());
1464 }
1465 
1466 const RegisterBankInfo::InstructionMapping &
1467 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
1468 
1469   const MachineFunction &MF = *MI.getParent()->getParent();
1470   const MachineRegisterInfo &MRI = MF.getRegInfo();
1471   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1472   unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1473   LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
1474   unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
1475 
1476   const ValueMapping *ValMapping;
1477   const ValueMapping *PtrMapping;
1478 
1479   if (isInstrUniform(MI)) {
1480     // We have a uniform instruction so we want to use an SMRD load
1481     ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1482     PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
1483   } else {
1484     ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
1485     // FIXME: What would happen if we used SGPRRegBankID here?
1486     PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
1487   }
1488 
1489   OpdsMapping[0] = ValMapping;
1490   OpdsMapping[1] = PtrMapping;
1491   const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
1492       1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
1493   return Mapping;
1494 
1495   // FIXME: Do we want to add a mapping for FLAT load, or should we just
1496   // handle that during instruction selection?
1497 }
1498 
1499 unsigned
1500 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
1501                                      const MachineRegisterInfo &MRI,
1502                                      const TargetRegisterInfo &TRI,
1503                                      unsigned Default) const {
1504 
1505   const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
1506   return Bank ? Bank->getID() : Default;
1507 }
1508 
1509 ///
1510 /// This function must return a legal mapping, because
1511 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
/// in RegBankSelect::Mode::Fast.  Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
1514 ///
1515 const RegisterBankInfo::InstructionMapping &
1516 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
1517   const MachineFunction &MF = *MI.getParent()->getParent();
1518   const MachineRegisterInfo &MRI = MF.getRegInfo();
1519 
1520   if (MI.isRegSequence()) {
1521     // If any input is a VGPR, the result must be a VGPR. The default handling
1522     // assumes any copy between banks is legal.
1523     unsigned BankID = AMDGPU::SGPRRegBankID;
1524 
1525     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1526       auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI, *TRI);
1527       // It doesn't make sense to use vcc or scc banks here, so just ignore
1528       // them.
1529       if (OpBank != AMDGPU::SGPRRegBankID) {
1530         BankID = AMDGPU::VGPRRegBankID;
1531         break;
1532       }
1533     }
1534     unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1535 
1536     const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
1537     return getInstructionMapping(
1538         1, /*Cost*/ 1,
1539         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1540   }
1541 
1542   // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
1543   // properly.
1544   //
1545   // TODO: There are additional exec masking dependencies to analyze.
1546   if (MI.getOpcode() == TargetOpcode::G_PHI) {
1547     // TODO: Generate proper invalid bank enum.
1548     int ResultBank = -1;
1549 
1550     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
1551       Register Reg = MI.getOperand(I).getReg();
1552       const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1553 
1554       // FIXME: Assuming VGPR for any undetermined inputs.
1555       if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
1556         ResultBank = AMDGPU::VGPRRegBankID;
1557         break;
1558       }
1559 
1560       unsigned OpBank = Bank->getID();
1561       // scc, scc -> sgpr
1562       if (OpBank == AMDGPU::SCCRegBankID) {
1563         // There's only one SCC register, so a phi requires copying to SGPR.
1564         OpBank = AMDGPU::SGPRRegBankID;
1565       } else if (OpBank == AMDGPU::VCCRegBankID) {
1566         // vcc, vcc -> vcc
1567         // vcc, sgpr -> vgpr
1568         if (ResultBank != -1 && ResultBank != AMDGPU::VCCRegBankID) {
1569           ResultBank = AMDGPU::VGPRRegBankID;
1570           break;
1571         }
1572       }
1573 
1574       ResultBank = OpBank;
1575     }
1576 
1577     assert(ResultBank != -1);
1578 
1579     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1580 
1581     const ValueMapping &ValMap =
1582         getValueMapping(0, Size, getRegBank(ResultBank));
1583     return getInstructionMapping(
1584         1, /*Cost*/ 1,
1585         /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
1586   }
1587 
1588   const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
1589   if (Mapping.isValid())
1590     return Mapping;
1591 
1592   SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
1593 
1594   switch (MI.getOpcode()) {
1595   default:
1596     return getInvalidInstructionMapping();
1597 
1598   case AMDGPU::G_AND:
1599   case AMDGPU::G_OR:
1600   case AMDGPU::G_XOR: {
1601     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1602     if (Size == 1) {
1603       const RegisterBank *DstBank
1604         = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
1605 
1606       unsigned TargetBankID = -1;
1607       unsigned BankLHS = -1;
1608       unsigned BankRHS = -1;
1609       if (DstBank) {
1610         TargetBankID = DstBank->getID();
1611         if (DstBank == &AMDGPU::VCCRegBank) {
1612           TargetBankID = AMDGPU::VCCRegBankID;
1613           BankLHS = AMDGPU::VCCRegBankID;
1614           BankRHS = AMDGPU::VCCRegBankID;
1615         } else if (DstBank == &AMDGPU::SCCRegBank) {
1616           TargetBankID = AMDGPU::SCCRegBankID;
1617           BankLHS = AMDGPU::SGPRRegBankID;
1618           BankRHS = AMDGPU::SGPRRegBankID;
1619         } else {
1620           BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
1621                                  AMDGPU::SGPRRegBankID);
1622           BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
1623                                  AMDGPU::SGPRRegBankID);
1624         }
1625       } else {
1626         BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
1627                                AMDGPU::VCCRegBankID);
1628         BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
1629                                AMDGPU::VCCRegBankID);
1630 
1631         // Both inputs should be true booleans to produce a boolean result.
1632         if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
1633           TargetBankID = AMDGPU::VGPRRegBankID;
1634         } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
1635           TargetBankID = AMDGPU::VCCRegBankID;
1636           BankLHS = AMDGPU::VCCRegBankID;
1637           BankRHS = AMDGPU::VCCRegBankID;
1638         } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
1639           TargetBankID = AMDGPU::SGPRRegBankID;
1640         } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
1641           // The operation must be done on a 32-bit register, but it will set
1642           // scc. The result type could interchangably be SCC or SGPR, since
1643           // both values will be produced.
1644           TargetBankID = AMDGPU::SCCRegBankID;
1645           BankLHS = AMDGPU::SGPRRegBankID;
1646           BankRHS = AMDGPU::SGPRRegBankID;
1647         }
1648       }
1649 
1650       OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
1651       OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
1652       OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
1653       break;
1654     }
1655 
1656     if (Size == 64) {
1657 
1658       if (isSALUMapping(MI)) {
1659         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
1660         OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
1661       } else {
1662         OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
1663         unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI/*, DefaultBankID*/);
1664         OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
1665 
1666         unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI/*, DefaultBankID*/);
1667         OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
1668       }
1669 
1670       break;
1671     }
1672 
1673     LLVM_FALLTHROUGH;
1674   }
1675 
1676   case AMDGPU::G_GEP:
1677   case AMDGPU::G_ADD:
1678   case AMDGPU::G_SUB:
1679   case AMDGPU::G_MUL:
1680   case AMDGPU::G_SHL:
1681   case AMDGPU::G_LSHR:
1682   case AMDGPU::G_ASHR:
1683   case AMDGPU::G_UADDO:
1684   case AMDGPU::G_SADDO:
1685   case AMDGPU::G_USUBO:
1686   case AMDGPU::G_SSUBO:
1687   case AMDGPU::G_UADDE:
1688   case AMDGPU::G_SADDE:
1689   case AMDGPU::G_USUBE:
1690   case AMDGPU::G_SSUBE:
1691   case AMDGPU::G_UMULH:
1692   case AMDGPU::G_SMULH:
1693   case AMDGPU::G_SMIN:
1694   case AMDGPU::G_SMAX:
1695   case AMDGPU::G_UMIN:
1696   case AMDGPU::G_UMAX:
1697     if (isSALUMapping(MI))
1698       return getDefaultMappingSOP(MI);
1699     LLVM_FALLTHROUGH;
1700 
1701   case AMDGPU::G_FADD:
1702   case AMDGPU::G_FSUB:
1703   case AMDGPU::G_FPTOSI:
1704   case AMDGPU::G_FPTOUI:
1705   case AMDGPU::G_FMUL:
1706   case AMDGPU::G_FMA:
1707   case AMDGPU::G_FSQRT:
1708   case AMDGPU::G_SITOFP:
1709   case AMDGPU::G_UITOFP:
1710   case AMDGPU::G_FPTRUNC:
1711   case AMDGPU::G_FPEXT:
1712   case AMDGPU::G_FEXP2:
1713   case AMDGPU::G_FLOG2:
1714   case AMDGPU::G_FMINNUM:
1715   case AMDGPU::G_FMAXNUM:
1716   case AMDGPU::G_FMINNUM_IEEE:
1717   case AMDGPU::G_FMAXNUM_IEEE:
1718   case AMDGPU::G_FCANONICALIZE:
1719   case AMDGPU::G_INTRINSIC_TRUNC:
1720   case AMDGPU::G_INTRINSIC_ROUND:
1721     return getDefaultMappingVOP(MI);
1722   case AMDGPU::G_IMPLICIT_DEF: {
1723     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1724     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1725     break;
1726   }
1727   case AMDGPU::G_FCONSTANT:
1728   case AMDGPU::G_CONSTANT:
1729   case AMDGPU::G_FRAME_INDEX:
1730   case AMDGPU::G_BLOCK_ADDR: {
1731     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1732     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1733     break;
1734   }
1735   case AMDGPU::G_INSERT: {
1736     unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
1737                                           AMDGPU::VGPRRegBankID;
1738     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1739     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
1740     unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
1741     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
1742     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
1743     OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
1744     OpdsMapping[3] = nullptr;
1745     break;
1746   }
1747   case AMDGPU::G_EXTRACT: {
1748     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
1749     unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
1750     unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
1751     OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
1752     OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
1753     OpdsMapping[2] = nullptr;
1754     break;
1755   }
1756   case AMDGPU::G_MERGE_VALUES:
1757   case AMDGPU::G_BUILD_VECTOR:
1758   case AMDGPU::G_CONCAT_VECTORS: {
1759     unsigned Bank = isSALUMapping(MI) ?
1760       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1761     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1762     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1763 
1764     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
1765     // Op1 and Dst should use the same register bank.
1766     for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
1767       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
1768     break;
1769   }
1770   case AMDGPU::G_BITCAST:
1771   case AMDGPU::G_INTTOPTR:
1772   case AMDGPU::G_PTRTOINT:
1773   case AMDGPU::G_CTLZ:
1774   case AMDGPU::G_CTLZ_ZERO_UNDEF:
1775   case AMDGPU::G_CTTZ:
1776   case AMDGPU::G_CTTZ_ZERO_UNDEF:
1777   case AMDGPU::G_CTPOP:
1778   case AMDGPU::G_BSWAP:
1779   case AMDGPU::G_FABS:
1780   case AMDGPU::G_FNEG: {
1781     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1782     unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
1783     OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
1784     break;
1785   }
1786   case AMDGPU::G_TRUNC: {
1787     Register Dst = MI.getOperand(0).getReg();
1788     Register Src = MI.getOperand(1).getReg();
1789     unsigned Bank = getRegBankID(Src, MRI, *TRI);
1790     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
1791     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
1792     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
1793     OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
1794     break;
1795   }
1796   case AMDGPU::G_ZEXT:
1797   case AMDGPU::G_SEXT:
1798   case AMDGPU::G_ANYEXT: {
1799     Register Dst = MI.getOperand(0).getReg();
1800     Register Src = MI.getOperand(1).getReg();
1801     unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
1802     unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
1803 
1804     unsigned DstBank;
1805     const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
1806     assert(SrcBank);
1807     switch (SrcBank->getID()) {
1808     case AMDGPU::SCCRegBankID:
1809     case AMDGPU::SGPRRegBankID:
1810       DstBank = AMDGPU::SGPRRegBankID;
1811       break;
1812     default:
1813       DstBank = AMDGPU::VGPRRegBankID;
1814       break;
1815     }
1816 
1817     // TODO: Should anyext be split into 32-bit part as well?
1818     if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
1819       OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
1820       OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
1821     } else {
1822       // Scalar extend can use 64-bit BFE, but VGPRs require extending to
1823       // 32-bits, and then to 64.
1824       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
1825       OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
1826                                                          SrcSize);
1827     }
1828     break;
1829   }
1830   case AMDGPU::G_FCMP: {
1831     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1832     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1833     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
1834     OpdsMapping[1] = nullptr; // Predicate Operand.
1835     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
1836     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1837     break;
1838   }
1839   case AMDGPU::G_STORE: {
1840     assert(MI.getOperand(0).isReg());
1841     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1842     // FIXME: We need to specify a different reg bank once scalar stores
1843     // are supported.
1844     const ValueMapping *ValMapping =
1845         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
1846     // FIXME: Depending on the type of store, the pointer could be in
1847     // the SGPR Reg bank.
1848     // FIXME: Pointer size should be based on the address space.
1849     const ValueMapping *PtrMapping =
1850         AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
1851 
1852     OpdsMapping[0] = ValMapping;
1853     OpdsMapping[1] = PtrMapping;
1854     break;
1855   }
1856 
1857   case AMDGPU::G_ICMP: {
1858     auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1859     unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1860     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1861     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
1862 
1863     bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
1864                      Op3Bank == AMDGPU::SGPRRegBankID &&
1865       (Size == 32 || (Size == 64 &&
1866                       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
1867                       MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
1868 
1869     unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
1870 
1871     OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
1872     OpdsMapping[1] = nullptr; // Predicate Operand.
1873     OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
1874     OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
1875     break;
1876   }
1877   case AMDGPU::G_EXTRACT_VECTOR_ELT: {
1878     unsigned OutputBankID = isSALUMapping(MI) ?
1879                             AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1880     unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1881     unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1882     unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1883 
1884     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
1885     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
1886 
1887     // The index can be either if the source vector is VGPR.
1888     OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
1889     break;
1890   }
1891   case AMDGPU::G_INSERT_VECTOR_ELT: {
1892     unsigned OutputBankID = isSALUMapping(MI) ?
1893       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
1894 
1895     unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1896     unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
1897     unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
1898     unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
1899     unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
1900 
1901     OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
1902     OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
1903     OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
1904 
1905     // The index can be either if the source vector is VGPR.
1906     OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
1907     break;
1908   }
1909   case AMDGPU::G_UNMERGE_VALUES: {
1910     unsigned Bank = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
1911       AMDGPU::VGPRRegBankID;
1912 
1913     // Op1 and Dst should use the same register bank.
1914     // FIXME: Shouldn't this be the default? Why do we need to handle this?
1915     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
1916       unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
1917       OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
1918     }
1919     break;
1920   }
1921   case AMDGPU::G_INTRINSIC: {
1922     switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1923     default:
1924       return getInvalidInstructionMapping();
1925     case Intrinsic::amdgcn_div_fmas:
1926     case Intrinsic::amdgcn_trig_preop:
1927     case Intrinsic::amdgcn_sin:
1928     case Intrinsic::amdgcn_cos:
1929     case Intrinsic::amdgcn_log_clamp:
1930     case Intrinsic::amdgcn_rcp:
1931     case Intrinsic::amdgcn_rcp_legacy:
1932     case Intrinsic::amdgcn_rsq:
1933     case Intrinsic::amdgcn_rsq_legacy:
1934     case Intrinsic::amdgcn_rsq_clamp:
1935     case Intrinsic::amdgcn_ldexp:
1936     case Intrinsic::amdgcn_frexp_mant:
1937     case Intrinsic::amdgcn_frexp_exp:
1938     case Intrinsic::amdgcn_fract:
1939     case Intrinsic::amdgcn_cvt_pkrtz:
1940     case Intrinsic::amdgcn_cvt_pknorm_i16:
1941     case Intrinsic::amdgcn_cvt_pknorm_u16:
1942     case Intrinsic::amdgcn_cvt_pk_i16:
1943     case Intrinsic::amdgcn_cvt_pk_u16:
1944     case Intrinsic::amdgcn_fmed3:
1945     case Intrinsic::amdgcn_cubeid:
1946     case Intrinsic::amdgcn_cubema:
1947     case Intrinsic::amdgcn_cubesc:
1948     case Intrinsic::amdgcn_cubetc:
1949     case Intrinsic::amdgcn_sffbh:
1950     case Intrinsic::amdgcn_fmad_ftz:
1951     case Intrinsic::amdgcn_mbcnt_lo:
1952     case Intrinsic::amdgcn_mbcnt_hi:
1953     case Intrinsic::amdgcn_ubfe:
1954     case Intrinsic::amdgcn_sbfe:
1955     case Intrinsic::amdgcn_lerp:
1956     case Intrinsic::amdgcn_sad_u8:
1957     case Intrinsic::amdgcn_msad_u8:
1958     case Intrinsic::amdgcn_sad_hi_u8:
1959     case Intrinsic::amdgcn_sad_u16:
1960     case Intrinsic::amdgcn_qsad_pk_u16_u8:
1961     case Intrinsic::amdgcn_mqsad_pk_u16_u8:
1962     case Intrinsic::amdgcn_mqsad_u32_u8:
1963     case Intrinsic::amdgcn_cvt_pk_u8_f32:
1964     case Intrinsic::amdgcn_alignbit:
1965     case Intrinsic::amdgcn_alignbyte:
1966     case Intrinsic::amdgcn_fdot2:
1967     case Intrinsic::amdgcn_sdot2:
1968     case Intrinsic::amdgcn_udot2:
1969     case Intrinsic::amdgcn_sdot4:
1970     case Intrinsic::amdgcn_udot4:
1971     case Intrinsic::amdgcn_sdot8:
1972     case Intrinsic::amdgcn_udot8:
1973     case Intrinsic::amdgcn_wwm:
1974     case Intrinsic::amdgcn_wqm:
1975       return getDefaultMappingVOP(MI);
1976     case Intrinsic::amdgcn_ds_permute:
1977     case Intrinsic::amdgcn_ds_bpermute:
1978     case Intrinsic::amdgcn_update_dpp:
1979       return getDefaultMappingAllVGPR(MI);
1980     case Intrinsic::amdgcn_kernarg_segment_ptr:
1981     case Intrinsic::amdgcn_s_getpc:
1982     case Intrinsic::amdgcn_groupstaticsize: {
1983       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1984       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
1985       break;
1986     }
1987     case Intrinsic::amdgcn_wqm_vote: {
1988       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1989       OpdsMapping[0] = OpdsMapping[2]
1990         = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
1991       break;
1992     }
1993     case Intrinsic::amdgcn_s_buffer_load: {
1994       // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
1995       Register RSrc = MI.getOperand(2).getReg();   // SGPR
1996       Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
1997 
1998       unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1999       unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2000       unsigned Size3 = MRI.getType(Offset).getSizeInBits();
2001 
2002       unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2003       unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2004 
2005       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
2006       OpdsMapping[1] = nullptr; // intrinsic id
2007 
2008       // Lie and claim everything is legal, even though some need to be
2009       // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2010       OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2011       OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
2012       OpdsMapping[4] = nullptr;
2013       break;
2014     }
2015     case Intrinsic::amdgcn_div_scale: {
2016       unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2017       unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
2018       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
2019       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
2020 
2021       unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
2022       OpdsMapping[3] = AMDGPU::getValueMapping(
2023         getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
2024       OpdsMapping[4] = AMDGPU::getValueMapping(
2025         getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
2026 
2027       break;
2028     }
2029     case Intrinsic::amdgcn_class: {
2030       Register Src0Reg = MI.getOperand(2).getReg();
2031       Register Src1Reg = MI.getOperand(3).getReg();
2032       unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
2033       unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
2034       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2035       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
2036       OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
2037                                                Src0Size);
2038       OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
2039                                                Src1Size);
2040       break;
2041     }
2042     case Intrinsic::amdgcn_icmp:
2043     case Intrinsic::amdgcn_fcmp: {
2044       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2045       // This is not VCCRegBank because this is not used in boolean contexts.
2046       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2047       unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2048       unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
2049       unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
2050       OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
2051       OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
2052       break;
2053     }
2054     case Intrinsic::amdgcn_readlane: {
2055       // This must be an SGPR, but accept a VGPR.
2056       Register IdxReg = MI.getOperand(3).getReg();
2057       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2058       unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2059       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2060       LLVM_FALLTHROUGH;
2061     }
2062     case Intrinsic::amdgcn_readfirstlane: {
2063       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2064       unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
2065       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
2066       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2067       break;
2068     }
2069     case Intrinsic::amdgcn_writelane: {
2070       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2071       Register SrcReg = MI.getOperand(2).getReg();
2072       unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2073       unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2074       Register IdxReg = MI.getOperand(3).getReg();
2075       unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
2076       unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
2077       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2078 
2079       // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
2080       // to legalize.
2081       OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
2082       OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
2083       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
2084       break;
2085     }
2086     case Intrinsic::amdgcn_if_break: {
2087       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
2088       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2089       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
2090       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2091       break;
2092     }
2093     }
2094     break;
2095   }
2096   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2097     switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
2098     default:
2099       return getInvalidInstructionMapping();
2100     case Intrinsic::amdgcn_s_getreg:
2101     case Intrinsic::amdgcn_s_memtime:
2102     case Intrinsic::amdgcn_s_memrealtime:
2103     case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
2104       unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2105       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2106       break;
2107     }
2108     case Intrinsic::amdgcn_ds_append:
2109     case Intrinsic::amdgcn_ds_consume:
2110     case Intrinsic::amdgcn_ds_fadd:
2111     case Intrinsic::amdgcn_ds_fmin:
2112     case Intrinsic::amdgcn_ds_fmax:
2113     case Intrinsic::amdgcn_atomic_inc:
2114     case Intrinsic::amdgcn_atomic_dec:
2115       return getDefaultMappingAllVGPR(MI);
2116     case Intrinsic::amdgcn_ds_ordered_add:
2117     case Intrinsic::amdgcn_ds_ordered_swap: {
2118       unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2119       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
2120       unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2121                                  AMDGPU::SGPRRegBankID);
2122       OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
2123       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2124       break;
2125     }
2126     case Intrinsic::amdgcn_exp_compr:
2127       OpdsMapping[0] = nullptr; // IntrinsicID
2128       // FIXME: These are immediate values which can't be read from registers.
2129       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2130       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2131       // FIXME: Could we support packed types here?
2132       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2133       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2134       // FIXME: These are immediate values which can't be read from registers.
2135       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2136       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2137       break;
2138     case Intrinsic::amdgcn_exp:
2139       OpdsMapping[0] = nullptr; // IntrinsicID
2140       // FIXME: These are immediate values which can't be read from registers.
2141       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2142       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2143       // FIXME: Could we support packed types here?
2144       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2145       OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2146       OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2147       OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
2148       // FIXME: These are immediate values which can't be read from registers.
2149       OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2150       OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
2151       break;
2152     case Intrinsic::amdgcn_buffer_load: {
2153       Register RSrc = MI.getOperand(2).getReg();   // SGPR
2154       Register VIndex = MI.getOperand(3).getReg(); // VGPR
2155       Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
2156 
2157       unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2158       unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
2159       unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
2160       unsigned Size4 = MRI.getType(Offset).getSizeInBits();
2161 
2162       unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
2163       unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
2164 
2165       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
2166       OpdsMapping[1] = nullptr; // intrinsic id
2167 
2168       // Lie and claim everything is legal, even though some need to be
2169       // SGPRs. applyMapping will have to deal with it as a waterfall loop.
2170       OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
2171       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
2172       OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
2173       OpdsMapping[5] = nullptr;
2174       OpdsMapping[6] = nullptr;
2175       break;
2176     }
2177     case Intrinsic::amdgcn_s_sendmsg:
2178     case Intrinsic::amdgcn_s_sendmsghalt: {
2179       // This must be an SGPR, but accept a VGPR.
2180       unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2181                                    AMDGPU::SGPRRegBankID);
2182       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
2183       break;
2184     }
2185     case Intrinsic::amdgcn_end_cf: {
2186       unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
2187       OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
2188       break;
2189     }
2190     }
2191     break;
2192   }
2193   case AMDGPU::G_SELECT: {
2194     unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
2195     unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
2196                                     AMDGPU::SGPRRegBankID);
2197     unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI,
2198                                     AMDGPU::SGPRRegBankID);
2199     bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
2200                     Op3Bank == AMDGPU::SGPRRegBankID;
2201 
2202     unsigned CondBankDefault = SGPRSrcs ?
2203       AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2204     unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
2205                                      CondBankDefault);
2206     if (CondBank == AMDGPU::SGPRRegBankID)
2207       CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
2208     else if (CondBank == AMDGPU::VGPRRegBankID)
2209       CondBank = AMDGPU::VCCRegBankID;
2210 
2211     unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
2212       AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
2213 
2214     assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
2215 
2216     if (Size == 64) {
2217       OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2218       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2219       OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2220       OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
2221     } else {
2222       OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
2223       OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
2224       OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
2225       OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
2226     }
2227 
2228     break;
2229   }
2230 
2231   case AMDGPU::G_LOAD:
2232     return getInstrMappingForLoad(MI);
2233 
2234   case AMDGPU::G_ATOMICRMW_XCHG:
2235   case AMDGPU::G_ATOMICRMW_ADD:
2236   case AMDGPU::G_ATOMICRMW_SUB:
2237   case AMDGPU::G_ATOMICRMW_AND:
2238   case AMDGPU::G_ATOMICRMW_OR:
2239   case AMDGPU::G_ATOMICRMW_XOR:
2240   case AMDGPU::G_ATOMICRMW_MAX:
2241   case AMDGPU::G_ATOMICRMW_MIN:
2242   case AMDGPU::G_ATOMICRMW_UMAX:
2243   case AMDGPU::G_ATOMICRMW_UMIN:
2244   case AMDGPU::G_ATOMICRMW_FADD:
2245   case AMDGPU::G_ATOMIC_CMPXCHG: {
2246     return getDefaultMappingAllVGPR(MI);
2247   }
2248   case AMDGPU::G_BRCOND: {
2249     unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
2250                                  AMDGPU::SGPRRegBankID);
2251     assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
2252     if (Bank != AMDGPU::SCCRegBankID)
2253       Bank = AMDGPU::VCCRegBankID;
2254 
2255     OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
2256     break;
2257   }
2258   }
2259 
2260   return getInstructionMapping(/*ID*/1, /*Cost*/1,
2261                                getOperandsMapping(OpdsMapping),
2262                                MI.getNumOperands());
2263 }
2264 
2265