1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIRegisterInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPURegisterBankInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUInstPrinter.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/MachineDominators.h"
23 #include "llvm/CodeGen/RegisterScavenging.h"
24 
25 using namespace llvm;
26 
27 #define GET_REGINFO_TARGET_DESC
28 #include "AMDGPUGenRegisterInfo.inc"
29 
30 static cl::opt<bool> EnableSpillSGPRToVGPR(
31   "amdgpu-spill-sgpr-to-vgpr",
32   cl::desc("Enable spilling SGPRs to VGPRs"),
33   cl::ReallyHidden,
34   cl::init(true));
35 
36 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
37 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
38 
39 // Map numbers of DWORDs to indexes in SubRegFromChannelTable.
40 // Valid indexes are shifted by 1, so that a mapping of 0 means unsupported.
41 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
42 //      meaning index 7 in SubRegFromChannelTable.
43 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
44     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
45 
46 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
47     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
48       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
49 
50   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
51          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
52          (getSubRegIndexLaneMask(AMDGPU::lo16) |
53           getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
54            getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
55          "getNumCoveredRegs() will not work with generated subreg masks!");
56 
57   RegPressureIgnoredUnits.resize(getNumRegUnits());
58   RegPressureIgnoredUnits.set(
59       *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
60   for (auto Reg : AMDGPU::VGPR_HI16RegClass)
61     RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
62 
63   // HACK: Until this is fully tablegen'd.
64   static llvm::once_flag InitializeRegSplitPartsFlag;
65 
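  // RegSplitParts[N - 1][P] caches the sub-register index covering the P-th
  // N-DWORD slice of a register (e.g. RegSplitParts[1][2] is the 64-bit
  // sub-register at DWORD offset 4). It is filled from the generated
  // sub-register index sizes and offsets and consumed by getRegSplitParts().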
66   static auto InitializeRegSplitPartsOnce = [this]() {
67     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
68       unsigned Size = getSubRegIdxSize(Idx);
69       if (Size & 31)
70         continue;
71       std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
72       unsigned Pos = getSubRegIdxOffset(Idx);
73       if (Pos % Size)
74         continue;
75       Pos /= Size;
76       if (Vec.empty()) {
77         unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
78         Vec.resize(MaxNumParts);
79       }
80       Vec[Pos] = Idx;
81     }
82   };
83 
84   static llvm::once_flag InitializeSubRegFromChannelTableFlag;
85 
86   static auto InitializeSubRegFromChannelTableOnce = [this]() {
87     for (auto &Row : SubRegFromChannelTable)
88       Row.fill(AMDGPU::NoSubRegister);
89     for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
90       unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
91       unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
92       assert(Width < SubRegFromChannelTableWidthMap.size());
93       Width = SubRegFromChannelTableWidthMap[Width];
94       if (Width == 0)
95         continue;
96       unsigned TableIdx = Width - 1;
97       assert(TableIdx < SubRegFromChannelTable.size());
98       assert(Offset < SubRegFromChannelTable[TableIdx].size());
99       SubRegFromChannelTable[TableIdx][Offset] = Idx;
100     }
101   };
102 
103   llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
104   llvm::call_once(InitializeSubRegFromChannelTableFlag,
105                   InitializeSubRegFromChannelTableOnce);
106 }
107 
108 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
109                                            MCRegister Reg) const {
110   MCRegAliasIterator R(Reg, this, true);
111 
112   for (; R.isValid(); ++R)
113     Reserved.set(*R);
114 }
115 
116 // Forced to be here by one .inc
117 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
118   const MachineFunction *MF) const {
119   CallingConv::ID CC = MF->getFunction().getCallingConv();
120   switch (CC) {
121   case CallingConv::C:
122   case CallingConv::Fast:
123   case CallingConv::Cold:
124   case CallingConv::AMDGPU_Gfx:
125     return MF->getSubtarget<GCNSubtarget>().hasGFX90AInsts()
126         ? CSR_AMDGPU_HighRegs_With_AGPRs_SaveList
127         : CSR_AMDGPU_HighRegs_SaveList;
128   default: {
129     // Dummy to not crash RegisterClassInfo.
130     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
131     return &NoCalleeSavedReg;
132   }
133   }
134 }
135 
136 const MCPhysReg *
137 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
138   return nullptr;
139 }
140 
141 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
142                                                      CallingConv::ID CC) const {
143   switch (CC) {
144   case CallingConv::C:
145   case CallingConv::Fast:
146   case CallingConv::Cold:
147   case CallingConv::AMDGPU_Gfx:
148     return MF.getSubtarget<GCNSubtarget>().hasGFX90AInsts()
149         ? CSR_AMDGPU_HighRegs_With_AGPRs_RegMask
150         : CSR_AMDGPU_HighRegs_RegMask;
151   default:
152     return nullptr;
153   }
154 }
155 
156 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
157   return CSR_AMDGPU_NoRegs_RegMask;
158 }
159 
160 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
161   const SIFrameLowering *TFI =
162       MF.getSubtarget<GCNSubtarget>().getFrameLowering();
163   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
164   // During ISel lowering we always reserve the stack pointer in entry
165   // functions, but never actually want to reference it when accessing our own
166   // frame. If we need a frame pointer we use it, but otherwise we can just use
167   // an immediate "0" which we represent by returning NoRegister.
168   if (FuncInfo->isEntryFunction()) {
169     return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
170   }
171   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
172                         : FuncInfo->getStackPtrOffsetReg();
173 }
174 
175 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
176   // When we need stack realignment, we can't reference off of the
177   // stack pointer, so we reserve a base pointer.
178   const MachineFrameInfo &MFI = MF.getFrameInfo();
179   return MFI.getNumFixedObjects() && needsStackRealignment(MF);
180 }
181 
182 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
183 
184 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
185   return CSR_AMDGPU_AllVGPRs_RegMask;
186 }
187 
188 const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
189   return CSR_AMDGPU_AllAGPRs_RegMask;
190 }
191 
192 const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
193   return CSR_AMDGPU_AllVectorRegs_RegMask;
194 }
195 
196 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
197   return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
198 }
199 
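// For example, getSubRegFromChannel(2, 2) returns the index covering channels
// 2 and 3 (AMDGPU::sub2_sub3); widths with a zero entry in
// SubRegFromChannelTableWidthMap (e.g. 10 DWORDs) are not supported and hit
// the "Not implemented" assert below.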
200 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
201                                               unsigned NumRegs) {
202   assert(NumRegs < SubRegFromChannelTableWidthMap.size());
203   unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
204   assert(NumRegIndex && "Not implemented");
205   assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
206   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
207 }
208 
209 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
210   const MachineFunction &MF) const {
211   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
212   MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
213   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
214 }
215 
216 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
217   BitVector Reserved(getNumRegs());
218   Reserved.set(AMDGPU::MODE);
219 
220   // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
221   // this seems likely to result in bugs, so I'm marking them as reserved.
222   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
223   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
224 
225   // M0 has to be reserved so that llvm accepts it as a live-in to a block.
226   reserveRegisterTuples(Reserved, AMDGPU::M0);
227 
228   // Reserve src_vccz, src_execz, src_scc.
229   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
230   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
231   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
232 
233   // Reserve the memory aperture registers.
234   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
235   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
236   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
237   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
238 
239   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
240   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
241 
242   // Reserve xnack_mask registers - support is not implemented in Codegen.
243   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
244 
245   // Reserve lds_direct register - support is not implemented in Codegen.
246   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
247 
248   // Reserve Trap Handler registers - support is not implemented in Codegen.
249   reserveRegisterTuples(Reserved, AMDGPU::TBA);
250   reserveRegisterTuples(Reserved, AMDGPU::TMA);
251   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
252   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
253   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
254   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
255   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
256   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
257   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
258   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
259 
260   // Reserve null register - it shall never be allocated
261   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
262 
263   // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
264   // will result in bugs.
265   if (isWave32) {
266     Reserved.set(AMDGPU::VCC);
267     Reserved.set(AMDGPU::VCC_HI);
268   }
269 
270   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
271   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
272   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
273     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
274     reserveRegisterTuples(Reserved, Reg);
275   }
276 
277   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
278   // TODO: In an entry function without calls and without AGPRs in use, it is
279   //       possible to use the whole register budget for VGPRs. Beyond that,
280   //       it should be possible to estimate the maximum AGPR/VGPR pressure
281   //       and split the register file accordingly.
282   if (ST.hasGFX90AInsts())
283     MaxNumVGPRs /= 2;
284   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
285   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
286     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
287     reserveRegisterTuples(Reserved, Reg);
288     Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
289     reserveRegisterTuples(Reserved, Reg);
290   }
291 
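  // Reserve the hi16 halves of all 32-bit scalar registers; of the lo16
  // halves, only those belonging to real SGPRs stay allocatable.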
292   for (auto Reg : AMDGPU::SReg_32RegClass) {
293     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
294     Register Low = getSubReg(Reg, AMDGPU::lo16);
295     // This is to prevent BB vcc liveness errors.
296     if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
297       Reserved.set(Low);
298   }
299 
300   for (auto Reg : AMDGPU::AGPR_32RegClass) {
301     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
302   }
303 
304   // Reserve all remaining AGPRs if there are no instructions that can use them.
305   if (!ST.hasMAIInsts()) {
306     for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
307       unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
308       reserveRegisterTuples(Reserved, Reg);
309     }
310   }
311 
312   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
313 
314   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
315   if (ScratchRSrcReg != AMDGPU::NoRegister) {
316     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
317     // to spill.
318     // TODO: May need to reserve a VGPR if doing LDS spilling.
319     reserveRegisterTuples(Reserved, ScratchRSrcReg);
320   }
321 
322   // We have to assume the SP is needed in case there are calls in the function,
323   // which is detected after the function is lowered. If we aren't really going
324   // to need SP, don't bother reserving it.
325   MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
326 
327   if (StackPtrReg) {
328     reserveRegisterTuples(Reserved, StackPtrReg);
329     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
330   }
331 
332   MCRegister FrameReg = MFI->getFrameOffsetReg();
333   if (FrameReg) {
334     reserveRegisterTuples(Reserved, FrameReg);
335     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
336   }
337 
338   if (hasBasePointer(MF)) {
339     MCRegister BasePtrReg = getBaseRegister();
340     reserveRegisterTuples(Reserved, BasePtrReg);
341     assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
342   }
343 
344   for (MCRegister Reg : MFI->WWMReservedRegs) {
345     reserveRegisterTuples(Reserved, Reg);
346   }
347 
348   // FIXME: Stop using reserved registers for this.
349   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
350     reserveRegisterTuples(Reserved, Reg);
351 
352   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
353     reserveRegisterTuples(Reserved, Reg);
354 
355   for (auto SSpill : MFI->getSGPRSpillVGPRs())
356     reserveRegisterTuples(Reserved, SSpill.VGPR);
357 
358   return Reserved;
359 }
360 
361 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
362   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
363   // On entry, the base address is 0, so it can't possibly need any more
364   // alignment.
365 
366   // FIXME: Should be able to specify the entry frame alignment per calling
367   // convention instead.
368   if (Info->isEntryFunction())
369     return false;
370 
371   return TargetRegisterInfo::canRealignStack(MF);
372 }
373 
374 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
375   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
376   if (Info->isEntryFunction()) {
377     const MachineFrameInfo &MFI = Fn.getFrameInfo();
378     return MFI.hasStackObjects() || MFI.hasCalls();
379   }
380 
381   // May need scavenger for dealing with callee saved registers.
382   return true;
383 }
384 
385 bool SIRegisterInfo::requiresFrameIndexScavenging(
386   const MachineFunction &MF) const {
387   // Do not use frame virtual registers. They used to be used for SGPRs, but
388   // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
389   // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
390   // spill.
391   return false;
392 }
393 
394 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
395   const MachineFunction &MF) const {
396   const MachineFrameInfo &MFI = MF.getFrameInfo();
397   return MFI.hasStackObjects();
398 }
399 
400 bool SIRegisterInfo::requiresVirtualBaseRegisters(
401   const MachineFunction &) const {
402   // There are no special dedicated stack or frame pointers.
403   return true;
404 }
405 
406 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
407   assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
408 
409   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
410                                           AMDGPU::OpName::offset);
411   return MI->getOperand(OffIdx).getImm();
412 }
413 
414 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
415                                                  int Idx) const {
416   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
417     return 0;
418 
419   assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
420                                             AMDGPU::OpName::vaddr) ||
421          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
422                                             AMDGPU::OpName::saddr))) &&
423          "Should never see frame index on non-address operand");
424 
425   return getScratchInstrOffset(MI);
426 }
427 
428 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
429   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
430     return false;
431 
432   int64_t FullOffset = Offset + getScratchInstrOffset(MI);
433 
434   if (SIInstrInfo::isMUBUF(*MI))
435     return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
436 
437   const SIInstrInfo *TII = ST.getInstrInfo();
438   return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
439 }
440 
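// Materialize a new base register holding FrameIdx + Offset. With flat
// scratch enabled the address is scalar, so this uses SGPRs and a scalar add;
// otherwise the address is built in a VGPR via getAddNoCarry().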
441 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
442                                                       int FrameIdx,
443                                                       int64_t Offset) const {
444   MachineBasicBlock::iterator Ins = MBB->begin();
445   DebugLoc DL; // Defaults to "unknown"
446 
447   if (Ins != MBB->end())
448     DL = Ins->getDebugLoc();
449 
450   MachineFunction *MF = MBB->getParent();
451   const SIInstrInfo *TII = ST.getInstrInfo();
452   MachineRegisterInfo &MRI = MF->getRegInfo();
453   unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
454                                            : AMDGPU::V_MOV_B32_e32;
455 
456   Register BaseReg = MRI.createVirtualRegister(
457       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
458                              : &AMDGPU::VGPR_32RegClass);
459 
460   if (Offset == 0) {
461     BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
462       .addFrameIndex(FrameIdx);
463     return BaseReg;
464   }
465 
466   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
467 
468   Register FIReg = MRI.createVirtualRegister(
469       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
470                              : &AMDGPU::VGPR_32RegClass);
471 
472   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
473     .addImm(Offset);
474   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
475     .addFrameIndex(FrameIdx);
476 
477   if (ST.enableFlatScratch()) {
478     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
479         .addReg(OffsetReg, RegState::Kill)
480         .addReg(FIReg);
481     return BaseReg;
482   }
483 
484   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
485     .addReg(OffsetReg, RegState::Kill)
486     .addReg(FIReg)
487     .addImm(0); // clamp bit
488 
489   return BaseReg;
490 }
491 
492 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
493                                        int64_t Offset) const {
494   const SIInstrInfo *TII = ST.getInstrInfo();
495   bool IsFlat = TII->isFLATScratch(MI);
496 
497 #ifndef NDEBUG
498   // FIXME: Is it possible to be storing a frame index to itself?
499   bool SeenFI = false;
500   for (const MachineOperand &MO: MI.operands()) {
501     if (MO.isFI()) {
502       if (SeenFI)
503         llvm_unreachable("should not see multiple frame indices");
504 
505       SeenFI = true;
506     }
507   }
508 #endif
509 
510   MachineOperand *FIOp =
511       TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
512                                       : AMDGPU::OpName::vaddr);
513 
514   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
515   int64_t NewOffset = OffsetOp->getImm() + Offset;
516 
517   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
518   assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
519 
520   if (IsFlat) {
521     assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
522            "offset should be legal");
523     FIOp->ChangeToRegister(BaseReg, false);
524     OffsetOp->setImm(NewOffset);
525     return;
526   }
527 
528 #ifndef NDEBUG
529   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
530   assert(SOffset->isImm() && SOffset->getImm() == 0);
531 #endif
532 
533   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
534          "offset should be legal");
535 
536   FIOp->ChangeToRegister(BaseReg, false);
537   OffsetOp->setImm(NewOffset);
538 }
539 
540 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
541                                         Register BaseReg,
542                                         int64_t Offset) const {
543   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
544     return false;
545 
546   int64_t NewOffset = Offset + getScratchInstrOffset(MI);
547 
548   if (SIInstrInfo::isMUBUF(*MI))
549     return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
550 
551   const SIInstrInfo *TII = ST.getInstrInfo();
552   return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
553 }
554 
555 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
556   const MachineFunction &MF, unsigned Kind) const {
557   // This is inaccurate. It depends on the instruction and address space. The
558   // only place where we should hit this is for dealing with frame indexes /
559   // private accesses, so this is correct in that case.
560   return &AMDGPU::VGPR_32RegClass;
561 }
562 
563 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
564 
565   switch (Op) {
566   case AMDGPU::SI_SPILL_S1024_SAVE:
567   case AMDGPU::SI_SPILL_S1024_RESTORE:
568   case AMDGPU::SI_SPILL_V1024_SAVE:
569   case AMDGPU::SI_SPILL_V1024_RESTORE:
570   case AMDGPU::SI_SPILL_A1024_SAVE:
571   case AMDGPU::SI_SPILL_A1024_RESTORE:
572     return 32;
573   case AMDGPU::SI_SPILL_S512_SAVE:
574   case AMDGPU::SI_SPILL_S512_RESTORE:
575   case AMDGPU::SI_SPILL_V512_SAVE:
576   case AMDGPU::SI_SPILL_V512_RESTORE:
577   case AMDGPU::SI_SPILL_A512_SAVE:
578   case AMDGPU::SI_SPILL_A512_RESTORE:
579     return 16;
580   case AMDGPU::SI_SPILL_S256_SAVE:
581   case AMDGPU::SI_SPILL_S256_RESTORE:
582   case AMDGPU::SI_SPILL_V256_SAVE:
583   case AMDGPU::SI_SPILL_V256_RESTORE:
584   case AMDGPU::SI_SPILL_A256_SAVE:
585   case AMDGPU::SI_SPILL_A256_RESTORE:
586     return 8;
587   case AMDGPU::SI_SPILL_S192_SAVE:
588   case AMDGPU::SI_SPILL_S192_RESTORE:
589   case AMDGPU::SI_SPILL_V192_SAVE:
590   case AMDGPU::SI_SPILL_V192_RESTORE:
591   case AMDGPU::SI_SPILL_A192_SAVE:
592   case AMDGPU::SI_SPILL_A192_RESTORE:
593     return 6;
594   case AMDGPU::SI_SPILL_S160_SAVE:
595   case AMDGPU::SI_SPILL_S160_RESTORE:
596   case AMDGPU::SI_SPILL_V160_SAVE:
597   case AMDGPU::SI_SPILL_V160_RESTORE:
598   case AMDGPU::SI_SPILL_A160_SAVE:
599   case AMDGPU::SI_SPILL_A160_RESTORE:
600     return 5;
601   case AMDGPU::SI_SPILL_S128_SAVE:
602   case AMDGPU::SI_SPILL_S128_RESTORE:
603   case AMDGPU::SI_SPILL_V128_SAVE:
604   case AMDGPU::SI_SPILL_V128_RESTORE:
605   case AMDGPU::SI_SPILL_A128_SAVE:
606   case AMDGPU::SI_SPILL_A128_RESTORE:
607     return 4;
608   case AMDGPU::SI_SPILL_S96_SAVE:
609   case AMDGPU::SI_SPILL_S96_RESTORE:
610   case AMDGPU::SI_SPILL_V96_SAVE:
611   case AMDGPU::SI_SPILL_V96_RESTORE:
612   case AMDGPU::SI_SPILL_A96_SAVE:
613   case AMDGPU::SI_SPILL_A96_RESTORE:
614     return 3;
615   case AMDGPU::SI_SPILL_S64_SAVE:
616   case AMDGPU::SI_SPILL_S64_RESTORE:
617   case AMDGPU::SI_SPILL_V64_SAVE:
618   case AMDGPU::SI_SPILL_V64_RESTORE:
619   case AMDGPU::SI_SPILL_A64_SAVE:
620   case AMDGPU::SI_SPILL_A64_RESTORE:
621     return 2;
622   case AMDGPU::SI_SPILL_S32_SAVE:
623   case AMDGPU::SI_SPILL_S32_RESTORE:
624   case AMDGPU::SI_SPILL_V32_SAVE:
625   case AMDGPU::SI_SPILL_V32_RESTORE:
626   case AMDGPU::SI_SPILL_A32_SAVE:
627   case AMDGPU::SI_SPILL_A32_RESTORE:
628     return 1;
629   default: llvm_unreachable("Invalid spill opcode");
630   }
631 }
632 
633 static int getOffsetMUBUFStore(unsigned Opc) {
634   switch (Opc) {
635   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
636     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
637   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
638     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
639   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
640     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
641   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
642     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
643   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
644     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
645   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
646     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
647   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
648     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
649   default:
650     return -1;
651   }
652 }
653 
654 static int getOffsetMUBUFLoad(unsigned Opc) {
655   switch (Opc) {
656   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
657     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
658   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
659     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
660   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
661     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
662   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
663     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
664   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
665     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
666   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
667     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
668   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
669     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
670   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
671     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
672   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
673     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
674   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
675     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
676   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
677     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
678   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
679     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
680   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
681     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
682   default:
683     return -1;
684   }
685 }
686 
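// If a spare AGPR (or VGPR, when spilling an AGPR) has been assigned for this
// frame index and lane, emit a single V_ACCVGPR read/write copy instead of a
// memory access. Returns a null builder when no such register was assigned.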
687 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
688                                            MachineBasicBlock::iterator MI,
689                                            int Index,
690                                            unsigned Lane,
691                                            unsigned ValueReg,
692                                            bool IsKill) {
693   MachineBasicBlock *MBB = MI->getParent();
694   MachineFunction *MF = MI->getParent()->getParent();
695   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
696   const SIInstrInfo *TII = ST.getInstrInfo();
697 
698   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
699 
700   if (Reg == AMDGPU::NoRegister)
701     return MachineInstrBuilder();
702 
703   bool IsStore = MI->mayStore();
704   MachineRegisterInfo &MRI = MF->getRegInfo();
705   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
706 
707   unsigned Dst = IsStore ? Reg : ValueReg;
708   unsigned Src = IsStore ? ValueReg : Reg;
709   unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
710                                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;
711 
712   auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
713                .addReg(Src, getKillRegState(IsKill));
714   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
715   return MIB;
716 }
717 
718 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
719 // need to handle the case where an SGPR may need to be spilled while spilling.
720 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
721                                       MachineFrameInfo &MFI,
722                                       MachineBasicBlock::iterator MI,
723                                       int Index,
724                                       int64_t Offset) {
725   const SIInstrInfo *TII = ST.getInstrInfo();
726   MachineBasicBlock *MBB = MI->getParent();
727   const DebugLoc &DL = MI->getDebugLoc();
728   bool IsStore = MI->mayStore();
729 
730   unsigned Opc = MI->getOpcode();
731   int LoadStoreOp = IsStore ?
732     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
733   if (LoadStoreOp == -1)
734     return false;
735 
736   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
737   if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
738     return true;
739 
740   MachineInstrBuilder NewMI =
741       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
742           .add(*Reg)
743           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
744           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
745           .addImm(Offset)
746           .addImm(0) // glc
747           .addImm(0) // slc
748           .addImm(0) // tfe
749           .addImm(0) // dlc
750           .addImm(0) // swz
751           .addImm(0) // scc
752           .cloneMemRefs(*MI);
753 
754   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
755                                                        AMDGPU::OpName::vdata_in);
756   if (VDataIn)
757     NewMI.add(*VDataIn);
758   return true;
759 }
760 
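// Select the SADDR form of the flat scratch spill opcode for the given
// element size, then switch to the ST (no address operand) form when the
// original opcode carried neither a vaddr nor a saddr operand.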
761 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
762                                           unsigned LoadStoreOp,
763                                           unsigned EltSize) {
764   bool IsStore = TII->get(LoadStoreOp).mayStore();
765   bool UseST =
766     AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
767     AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
768 
769   switch (EltSize) {
770   case 4:
771     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
772                           : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
773     break;
774   case 8:
775     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
776                           : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
777     break;
778   case 12:
779     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
780                           : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
781     break;
782   case 16:
783     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
784                           : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
785     break;
786   default:
787     llvm_unreachable("Unexpected spill load/store size!");
788   }
789 
790   if (UseST)
791     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
792 
793   return LoadStoreOp;
794 }
795 
796 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
797                                          unsigned LoadStoreOp,
798                                          int Index,
799                                          Register ValueReg,
800                                          bool IsKill,
801                                          MCRegister ScratchOffsetReg,
802                                          int64_t InstOffset,
803                                          MachineMemOperand *MMO,
804                                          RegScavenger *RS) const {
805   MachineBasicBlock *MBB = MI->getParent();
806   MachineFunction *MF = MI->getParent()->getParent();
807   const SIInstrInfo *TII = ST.getInstrInfo();
808   const MachineFrameInfo &MFI = MF->getFrameInfo();
809   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
810 
811   const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
812   const DebugLoc &DL = MI->getDebugLoc();
813   bool IsStore = Desc->mayStore();
814   bool IsFlat = TII->isFLATScratch(LoadStoreOp);
815 
816   bool Scavenged = false;
817   MCRegister SOffset = ScratchOffsetReg;
818 
819   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
820   // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
821   const bool IsAGPR = !ST.hasGFX90AInsts() && hasAGPRs(RC);
822   const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
823 
824   // Always use 4 byte operations for AGPRs because we need to scavenge
825   // a temporary VGPR.
826   unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
827   unsigned NumSubRegs = RegWidth / EltSize;
828   unsigned Size = NumSubRegs * EltSize;
829   unsigned RemSize = RegWidth - Size;
830   unsigned NumRemSubRegs = RemSize ? 1 : 0;
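  // e.g. a 192-bit flat scratch spill with EltSize == 16 is split into one
  // 16-byte access plus one 8-byte remainder access.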
831   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
832   int64_t MaxOffset = Offset + Size + RemSize - EltSize;
833   int64_t ScratchOffsetRegDelta = 0;
834 
835   if (IsFlat && EltSize > 4) {
836     LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
837     Desc = &TII->get(LoadStoreOp);
838   }
839 
840   Align Alignment = MFI.getObjectAlign(Index);
841   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
842 
843   assert((IsFlat || ((Offset % EltSize) == 0)) &&
844          "unexpected VGPR spill offset");
845 
846   bool IsOffsetLegal = IsFlat
847       ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
848       : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
849   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
850     SOffset = MCRegister();
851 
852     // We currently only support spilling VGPRs to EltSize boundaries, meaning
853     // we can simplify the adjustment of Offset here to just scale with
854     // WavefrontSize.
855     if (!IsFlat)
856       Offset *= ST.getWavefrontSize();
857 
858     // We don't have access to the register scavenger if this function is called
859     // during PEI::scavengeFrameVirtualRegs().
860     if (RS)
861       SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
862 
863     if (!SOffset) {
864       // There are no free SGPRs, and we are in the process of spilling VGPRs
865       // too. Since we need a VGPR in order to spill SGPRs (this is true on
866       // SI/CI, and on VI it is true until we implement spilling using scalar
867       // stores), we have no way to free up an SGPR. Our solution here is to
868       // add the offset directly to the ScratchOffset or StackPtrOffset
869       // register, and then subtract the offset after the spill to return the
870       // register to its original value.
871       if (!ScratchOffsetReg)
872         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
873       SOffset = ScratchOffsetReg;
874       ScratchOffsetRegDelta = Offset;
875     } else {
876       Scavenged = true;
877     }
878 
879     if (!SOffset)
880       report_fatal_error("could not scavenge SGPR to spill in entry function");
881 
882     if (ScratchOffsetReg == AMDGPU::NoRegister) {
883       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
884           .addImm(Offset);
885     } else {
886       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
887           .addReg(ScratchOffsetReg)
888           .addImm(Offset);
889     }
890 
891     Offset = 0;
892   }
893 
894   if (IsFlat && SOffset == AMDGPU::NoRegister) {
895     assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
896            && "Unexpected vaddr for flat scratch with a FI operand");
897 
898     assert(ST.hasFlatScratchSTMode());
899     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
900     Desc = &TII->get(LoadStoreOp);
901   }
902 
903   Register TmpReg;
904 
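  // Emit one memory access (or AGPR copy) per element; RegOffset is the byte
  // offset of the current element within ValueReg.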
905   for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
906        ++i, RegOffset += EltSize) {
907     if (i == NumSubRegs) {
908       EltSize = RemSize;
909       LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
910     }
911     Desc = &TII->get(LoadStoreOp);
912 
913     unsigned NumRegs = EltSize / 4;
914     Register SubReg = e == 1
915             ? ValueReg
916             : Register(getSubReg(ValueReg,
917                                  getSubRegFromChannel(RegOffset / 4, NumRegs)));
918 
919     unsigned SOffsetRegState = 0;
920     unsigned SrcDstRegState = getDefRegState(!IsStore);
921     if (i + 1 == e) {
922       SOffsetRegState |= getKillRegState(Scavenged);
923       // The last implicit use carries the "Kill" flag.
924       SrcDstRegState |= getKillRegState(IsKill);
925     }
926 
927     // Make sure the whole register is defined if there are undef components by
928     // adding an implicit def of the super-reg on the first instruction.
929     bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
930     bool NeedSuperRegImpOperand = e > 1;
931 
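    // First try to transfer each 32-bit lane of this element to/from an AGPR
    // assigned to this frame index; lanes handled this way need no memory
    // access.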
932     unsigned Lane = RegOffset / 4;
933     unsigned LaneE = (RegOffset + EltSize) / 4;
934     for ( ; Lane != LaneE; ++Lane) {
935       bool IsSubReg = e > 1 || EltSize > 4;
936       Register Sub = IsSubReg
937              ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
938              : ValueReg;
939       auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
940       if (!MIB.getInstr())
941         break;
942       if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
943         MIB.addReg(ValueReg, RegState::ImplicitDefine);
944         NeedSuperRegDef = false;
945       }
946       if (IsSubReg || NeedSuperRegImpOperand) {
947         NeedSuperRegImpOperand = true;
948         unsigned State = SrcDstRegState;
949         if (Lane + 1 != LaneE)
950           State &= ~RegState::Kill;
951         MIB.addReg(ValueReg, RegState::Implicit | State);
952       }
953     }
954 
955     if (Lane == LaneE) // Fully spilled into AGPRs.
956       continue;
957 
958     // Offset in bytes from the beginning of ValueReg to the portion we still
959     // need to spill. It may differ from RegOffset if part of the current
960     // SubReg has already been spilled into AGPRs by the loop above.
961     unsigned RemRegOffset = Lane * 4;
962     unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
963     if (RemEltSize != EltSize) { // Partially spilled to AGPRs
964       assert(IsFlat && EltSize > 4);
965 
966       unsigned NumRegs = RemEltSize / 4;
967       SubReg = Register(getSubReg(ValueReg,
968                         getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
969       unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
970       Desc = &TII->get(Opc);
971     }
972 
973     unsigned FinalReg = SubReg;
974 
975     if (IsAGPR) {
976       assert(EltSize == 4);
977 
978       if (!TmpReg) {
979         assert(RS && "Needs to have RegScavenger to spill an AGPR!");
980         // FIXME: change to scavengeRegisterBackwards()
981         TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
982         RS->setRegUsed(TmpReg);
983       }
984       if (IsStore) {
985         auto AccRead = BuildMI(*MBB, MI, DL,
986                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
987           .addReg(SubReg, getKillRegState(IsKill));
988         if (NeedSuperRegDef)
989           AccRead.addReg(ValueReg, RegState::ImplicitDefine);
990         AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
991       }
992       SubReg = TmpReg;
993     }
994 
995     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
996     MachineMemOperand *NewMMO =
997         MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
998                                  commonAlignment(Alignment, RemRegOffset));
999 
1000     auto MIB = BuildMI(*MBB, MI, DL, *Desc)
1001                   .addReg(SubReg,
1002                           getDefRegState(!IsStore) | getKillRegState(IsKill));
1003     if (!IsFlat)
1004       MIB.addReg(FuncInfo->getScratchRSrcReg());
1005 
1006     if (SOffset == AMDGPU::NoRegister) {
1007       if (!IsFlat)
1008         MIB.addImm(0);
1009     } else {
1010       MIB.addReg(SOffset, SOffsetRegState);
1011     }
1012     MIB.addImm(Offset + RemRegOffset)
1013         .addImm(0) // glc
1014         .addImm(0) // slc
1015         .addImm(0); // tfe for MUBUF or dlc for FLAT
1016     if (!IsFlat)
1017       MIB.addImm(0) // dlc
1018          .addImm(0); // swz
1019     MIB.addImm(0); // scc
1020     MIB.addMemOperand(NewMMO);
1021 
1022     if (!IsAGPR && NeedSuperRegDef)
1023       MIB.addReg(ValueReg, RegState::ImplicitDefine);
1024 
1025     if (!IsStore && TmpReg != AMDGPU::NoRegister) {
1026       MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1027                     FinalReg)
1028         .addReg(TmpReg, RegState::Kill);
1029       MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1030     }
1031 
1032     if (NeedSuperRegImpOperand)
1033       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1034   }
1035 
1036   if (ScratchOffsetRegDelta != 0) {
1037     // Subtract the offset we added to the ScratchOffset register.
1038     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
1039         .addReg(SOffset)
1040         .addImm(ScratchOffsetRegDelta);
1041   }
1042 }
1043 
1044 // Generate a VMEM access which loads or stores the VGPR containing an SGPR
1045 // spill such that all the lanes set in VGPRLanes are loaded or stored.
1046 // This generates exec mask manipulation and will use SGPRs available in MI
1047 // or VGPR lanes in the VGPR to save and restore the exec mask.
1048 void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
1049                                              int Index, int Offset,
1050                                              unsigned EltSize, Register VGPR,
1051                                              int64_t VGPRLanes,
1052                                              RegScavenger *RS,
1053                                              bool IsLoad) const {
1054   MachineBasicBlock *MBB = MI->getParent();
1055   MachineFunction *MF = MBB->getParent();
1056   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1057   const SIInstrInfo *TII = ST.getInstrInfo();
1058 
1059   Register SuperReg = MI->getOperand(0).getReg();
1060   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1061   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1062   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1063   unsigned FirstPart = Offset * 32;
1064   unsigned ExecLane = 0;
1065 
1066   bool IsKill = MI->getOperand(0).isKill();
1067   const DebugLoc &DL = MI->getDebugLoc();
1068 
1069   // Cannot handle load/store to EXEC
1070   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1071          SuperReg != AMDGPU::EXEC && "exec should never spill");
1072 
1073   // On Wave32 only handle EXEC_LO.
1074   // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
1075   bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
1076 
1077   unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1078   Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1079   Register SavedExecReg;
1080 
1081   // Backup EXEC
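  // EXEC is saved into SGPR lanes of the register being spilled or restored:
  // for a reload they are overwritten anyway, and for a store the clobbered
  // SGPRs are read back from the VGPR lanes afterwards if the value is still
  // live (see "Restore clobbered SGPRs" below).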
1082   if (OnlyExecLo) {
1083     SavedExecReg =
1084         NumSubRegs == 1
1085             ? SuperReg
1086             : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
1087   } else {
1088     // If src/dst is an odd size it is possible subreg0 is not aligned.
1089     for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
1090       SavedExecReg = getMatchingSuperReg(
1091           getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
1092           &AMDGPU::SReg_64_XEXECRegClass);
1093       if (SavedExecReg)
1094         break;
1095     }
1096   }
1097   assert(SavedExecReg);
1098   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
1099 
1100   // Setup EXEC
1101   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
1102 
1103   // Load/store VGPR
1104   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1105   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1106 
1107   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1108                           ? getBaseRegister()
1109                           : getFrameRegister(*MF);
1110 
1111   Align Alignment = FrameInfo.getObjectAlign(Index);
1112   MachinePointerInfo PtrInfo =
1113       MachinePointerInfo::getFixedStack(*MF, Index);
1114   MachineMemOperand *MMO = MF->getMachineMemOperand(
1115       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1116       EltSize, Alignment);
1117 
1118   if (IsLoad) {
1119     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1120                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1121     buildSpillLoadStore(MI, Opc,
1122           Index,
1123           VGPR, false,
1124           FrameReg,
1125           Offset * EltSize, MMO,
1126           RS);
1127   } else {
1128     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1129                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1130     buildSpillLoadStore(MI, Opc, Index, VGPR,
1131                         IsKill, FrameReg,
1132                         Offset * EltSize, MMO, RS);
1133     // This only ever adds one VGPR spill
1134     MFI->addToSpilledVGPRs(1);
1135   }
1136 
1137   // Restore EXEC
1138   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
1139       .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
1140 
1141   // Restore clobbered SGPRs
1142   if (IsLoad) {
1143     // Nothing to do; register will be overwritten
1144   } else if (!IsKill) {
1145     // Restore SGPRs from appropriate VGPR lanes
1146     if (!OnlyExecLo) {
1147       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1148               getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
1149           .addReg(VGPR)
1150           .addImm(ExecLane + 1);
1151     }
1152     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1153             NumSubRegs == 1 ? SavedExecReg
1154                             : Register(getSubReg(
1155                                   SuperReg, SplitParts[FirstPart + ExecLane])))
1156         .addReg(VGPR, RegState::Kill)
1157         .addImm(ExecLane);
1158   }
1159 }
1160 
1161 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
1162                                int Index,
1163                                RegScavenger *RS,
1164                                bool OnlyToVGPR) const {
1165   MachineBasicBlock *MBB = MI->getParent();
1166   MachineFunction *MF = MBB->getParent();
1167   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1168 
1169   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1170     = MFI->getSGPRToVGPRSpills(Index);
1171   bool SpillToVGPR = !VGPRSpills.empty();
1172   if (OnlyToVGPR && !SpillToVGPR)
1173     return false;
1174 
1175   const SIInstrInfo *TII = ST.getInstrInfo();
1176 
1177   Register SuperReg = MI->getOperand(0).getReg();
1178   bool IsKill = MI->getOperand(0).isKill();
1179   const DebugLoc &DL = MI->getDebugLoc();
1180 
1181   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
1182                          SuperReg != MFI->getFrameOffsetReg()));
1183 
1184   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1185   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1186          SuperReg != AMDGPU::EXEC && "exec should never spill");
1187 
1188   unsigned EltSize = 4;
1189   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1190 
1191   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1192   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1193 
1194   if (SpillToVGPR) {
1195     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1196       Register SubReg = NumSubRegs == 1
1197                             ? SuperReg
1198                             : Register(getSubReg(SuperReg, SplitParts[i]));
1199       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1200 
1201       bool UseKill = IsKill && i == NumSubRegs - 1;
1202 
1203       // Mark the "old value of vgpr" input undef only if this is the first sgpr
1204       // spill to this specific vgpr in the first basic block.
1205       auto MIB =
1206           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
1207               .addReg(SubReg, getKillRegState(UseKill))
1208               .addImm(Spill.Lane)
1209               .addReg(Spill.VGPR);
1210 
1211       if (i == 0 && NumSubRegs > 1) {
1212         // We may be spilling a super-register which is only partially defined,
1213         // and need to ensure later spills think the value is defined.
1214         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1215       }
1216 
1217       if (NumSubRegs > 1)
1218         MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1219 
1220       // FIXME: Since this spills to another register instead of an actual
1221       // frame index, we should delete the frame index when all references to
1222       // it are fixed.
1223     }
1224   } else {
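    // No VGPR lanes were reserved for this SGPR spill, so write the lanes
    // into a scavenged temporary VGPR and store that VGPR to the stack slot.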
1225     // Scavenged temporary VGPR to use. It must be scavenged once for any number
1226     // of spilled subregs.
1227     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1228     RS->setRegUsed(TmpVGPR);
1229 
1230     // SubReg carries the "Kill" flag when SubReg == SuperReg.
1231     unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
1232 
1233     unsigned PerVGPR = 32;
1234     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1235     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1236 
1237     for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1238       unsigned TmpVGPRFlags = RegState::Undef;
1239 
1240       // Write sub registers into the VGPR
1241       for (unsigned i = Offset * PerVGPR,
1242                     e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1243            i < e; ++i) {
1244         Register SubReg = NumSubRegs == 1
1245                               ? SuperReg
1246                               : Register(getSubReg(SuperReg, SplitParts[i]));
1247 
1248         MachineInstrBuilder WriteLane =
1249             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
1250                 .addReg(SubReg, SubKillState)
1251                 .addImm(i % PerVGPR)
1252                 .addReg(TmpVGPR, TmpVGPRFlags);
1253         TmpVGPRFlags = 0;
1254 
1255         // There could be undef components of a spilled super register.
1256         // TODO: Can we detect this and skip the spill?
1257         if (NumSubRegs > 1) {
1258           // The last implicit use of the SuperReg carries the "Kill" flag.
1259           unsigned SuperKillState = 0;
1260           if (i + 1 == NumSubRegs)
1261             SuperKillState |= getKillRegState(IsKill);
1262           WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
1263         }
1264       }
1265 
1266       // Write out VGPR
1267       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1268                               RS, false);
1269     }
1270   }
1271 
1272   MI->eraseFromParent();
1273   MFI->addToSpilledSGPRs(NumSubRegs);
1274   return true;
1275 }
1276 
1277 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
1278                                  int Index,
1279                                  RegScavenger *RS,
1280                                  bool OnlyToVGPR) const {
1281   MachineFunction *MF = MI->getParent()->getParent();
1282   MachineBasicBlock *MBB = MI->getParent();
1283   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1284 
1285   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1286     = MFI->getSGPRToVGPRSpills(Index);
1287   bool SpillToVGPR = !VGPRSpills.empty();
1288   if (OnlyToVGPR && !SpillToVGPR)
1289     return false;
1290 
1291   const SIInstrInfo *TII = ST.getInstrInfo();
1292   const DebugLoc &DL = MI->getDebugLoc();
1293 
1294   Register SuperReg = MI->getOperand(0).getReg();
1295 
1296   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1297   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1298          SuperReg != AMDGPU::EXEC && "exec should never spill");
1299 
1300   unsigned EltSize = 4;
1301 
1302   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1303 
1304   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1305   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1306 
1307   if (SpillToVGPR) {
1308     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1309       Register SubReg = NumSubRegs == 1
1310                             ? SuperReg
1311                             : Register(getSubReg(SuperReg, SplitParts[i]));
1312 
1313       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1314       auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1315                      .addReg(Spill.VGPR)
1316                      .addImm(Spill.Lane);
1317       if (NumSubRegs > 1 && i == 0)
1318         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1319     }
1320   } else {
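    // No VGPR lanes were reserved for this spill; reload the stack slot into
    // a scavenged temporary VGPR and read the SGPR lanes back out of it.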
1321     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1322     RS->setRegUsed(TmpVGPR);
1323 
1324     unsigned PerVGPR = 32;
1325     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1326     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1327 
1328     for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1329       // Load in VGPR data
1330       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1331                               RS, true);
1332 
1333       // Unpack lanes
1334       for (unsigned i = Offset * PerVGPR,
1335                     e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1336            i < e; ++i) {
1337         Register SubReg = NumSubRegs == 1
1338                               ? SuperReg
1339                               : Register(getSubReg(SuperReg, SplitParts[i]));
1340 
1341         bool LastSubReg = (i + 1 == e);
1342         auto MIB =
1343             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1344                 .addReg(TmpVGPR, getKillRegState(LastSubReg))
1345                 .addImm(i);
1346         if (NumSubRegs > 1 && i == 0)
1347           MIB.addReg(SuperReg, RegState::ImplicitDefine);
1348       }
1349     }
1350   }
1351 
1352   MI->eraseFromParent();
1353   return true;
1354 }
1355 
1356 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1357 /// a VGPR and the stack slot can be safely eliminated when all other users are
1358 /// handled.
1359 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
1360   MachineBasicBlock::iterator MI,
1361   int FI,
1362   RegScavenger *RS) const {
1363   switch (MI->getOpcode()) {
1364   case AMDGPU::SI_SPILL_S1024_SAVE:
1365   case AMDGPU::SI_SPILL_S512_SAVE:
1366   case AMDGPU::SI_SPILL_S256_SAVE:
1367   case AMDGPU::SI_SPILL_S192_SAVE:
1368   case AMDGPU::SI_SPILL_S160_SAVE:
1369   case AMDGPU::SI_SPILL_S128_SAVE:
1370   case AMDGPU::SI_SPILL_S96_SAVE:
1371   case AMDGPU::SI_SPILL_S64_SAVE:
1372   case AMDGPU::SI_SPILL_S32_SAVE:
1373     return spillSGPR(MI, FI, RS, true);
1374   case AMDGPU::SI_SPILL_S1024_RESTORE:
1375   case AMDGPU::SI_SPILL_S512_RESTORE:
1376   case AMDGPU::SI_SPILL_S256_RESTORE:
1377   case AMDGPU::SI_SPILL_S192_RESTORE:
1378   case AMDGPU::SI_SPILL_S160_RESTORE:
1379   case AMDGPU::SI_SPILL_S128_RESTORE:
1380   case AMDGPU::SI_SPILL_S96_RESTORE:
1381   case AMDGPU::SI_SPILL_S64_RESTORE:
1382   case AMDGPU::SI_SPILL_S32_RESTORE:
1383     return restoreSGPR(MI, FI, RS, true);
1384   default:
1385     llvm_unreachable("not an SGPR spill instruction");
1386   }
1387 }
1388 
1389 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
1390                                         int SPAdj, unsigned FIOperandNum,
1391                                         RegScavenger *RS) const {
1392   MachineFunction *MF = MI->getParent()->getParent();
1393   MachineBasicBlock *MBB = MI->getParent();
1394   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1395   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1396   const SIInstrInfo *TII = ST.getInstrInfo();
1397   DebugLoc DL = MI->getDebugLoc();
1398 
1399   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1400 
1401   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1402   int Index = MI->getOperand(FIOperandNum).getIndex();
1403 
1404   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1405                           ? getBaseRegister()
1406                           : getFrameRegister(*MF);
1407 
1408   switch (MI->getOpcode()) {
1409     // SGPR register spill
1410     case AMDGPU::SI_SPILL_S1024_SAVE:
1411     case AMDGPU::SI_SPILL_S512_SAVE:
1412     case AMDGPU::SI_SPILL_S256_SAVE:
1413     case AMDGPU::SI_SPILL_S192_SAVE:
1414     case AMDGPU::SI_SPILL_S160_SAVE:
1415     case AMDGPU::SI_SPILL_S128_SAVE:
1416     case AMDGPU::SI_SPILL_S96_SAVE:
1417     case AMDGPU::SI_SPILL_S64_SAVE:
1418     case AMDGPU::SI_SPILL_S32_SAVE: {
1419       spillSGPR(MI, Index, RS);
1420       break;
1421     }
1422 
1423     // SGPR register restore
1424     case AMDGPU::SI_SPILL_S1024_RESTORE:
1425     case AMDGPU::SI_SPILL_S512_RESTORE:
1426     case AMDGPU::SI_SPILL_S256_RESTORE:
1427     case AMDGPU::SI_SPILL_S192_RESTORE:
1428     case AMDGPU::SI_SPILL_S160_RESTORE:
1429     case AMDGPU::SI_SPILL_S128_RESTORE:
1430     case AMDGPU::SI_SPILL_S96_RESTORE:
1431     case AMDGPU::SI_SPILL_S64_RESTORE:
1432     case AMDGPU::SI_SPILL_S32_RESTORE: {
1433       restoreSGPR(MI, Index, RS);
1434       break;
1435     }
1436 
1437     // VGPR register spill
1438     case AMDGPU::SI_SPILL_V1024_SAVE:
1439     case AMDGPU::SI_SPILL_V512_SAVE:
1440     case AMDGPU::SI_SPILL_V256_SAVE:
1441     case AMDGPU::SI_SPILL_V192_SAVE:
1442     case AMDGPU::SI_SPILL_V160_SAVE:
1443     case AMDGPU::SI_SPILL_V128_SAVE:
1444     case AMDGPU::SI_SPILL_V96_SAVE:
1445     case AMDGPU::SI_SPILL_V64_SAVE:
1446     case AMDGPU::SI_SPILL_V32_SAVE:
1447     case AMDGPU::SI_SPILL_A1024_SAVE:
1448     case AMDGPU::SI_SPILL_A512_SAVE:
1449     case AMDGPU::SI_SPILL_A256_SAVE:
1450     case AMDGPU::SI_SPILL_A192_SAVE:
1451     case AMDGPU::SI_SPILL_A160_SAVE:
1452     case AMDGPU::SI_SPILL_A128_SAVE:
1453     case AMDGPU::SI_SPILL_A96_SAVE:
1454     case AMDGPU::SI_SPILL_A64_SAVE:
1455     case AMDGPU::SI_SPILL_A32_SAVE: {
1456       const MachineOperand *VData = TII->getNamedOperand(*MI,
1457                                                          AMDGPU::OpName::vdata);
1458       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1459              MFI->getStackPtrOffsetReg());
1460 
1461       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1462                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1463       buildSpillLoadStore(MI, Opc,
1464             Index,
1465             VData->getReg(), VData->isKill(),
1466             FrameReg,
1467             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1468             *MI->memoperands_begin(),
1469             RS);
1470       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1471       MI->eraseFromParent();
1472       break;
1473     }
1474     case AMDGPU::SI_SPILL_V32_RESTORE:
1475     case AMDGPU::SI_SPILL_V64_RESTORE:
1476     case AMDGPU::SI_SPILL_V96_RESTORE:
1477     case AMDGPU::SI_SPILL_V128_RESTORE:
1478     case AMDGPU::SI_SPILL_V160_RESTORE:
1479     case AMDGPU::SI_SPILL_V192_RESTORE:
1480     case AMDGPU::SI_SPILL_V256_RESTORE:
1481     case AMDGPU::SI_SPILL_V512_RESTORE:
1482     case AMDGPU::SI_SPILL_V1024_RESTORE:
1483     case AMDGPU::SI_SPILL_A32_RESTORE:
1484     case AMDGPU::SI_SPILL_A64_RESTORE:
1485     case AMDGPU::SI_SPILL_A96_RESTORE:
1486     case AMDGPU::SI_SPILL_A128_RESTORE:
1487     case AMDGPU::SI_SPILL_A160_RESTORE:
1488     case AMDGPU::SI_SPILL_A192_RESTORE:
1489     case AMDGPU::SI_SPILL_A256_RESTORE:
1490     case AMDGPU::SI_SPILL_A512_RESTORE:
1491     case AMDGPU::SI_SPILL_A1024_RESTORE: {
1492       const MachineOperand *VData = TII->getNamedOperand(*MI,
1493                                                          AMDGPU::OpName::vdata);
1494       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1495              MFI->getStackPtrOffsetReg());
1496 
1497       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1498                                             : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1499       buildSpillLoadStore(MI, Opc,
1500             Index,
1501             VData->getReg(), VData->isKill(),
1502             FrameReg,
1503             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1504             *MI->memoperands_begin(),
1505             RS);
1506       MI->eraseFromParent();
1507       break;
1508     }
1509 
1510     default: {
1511       const DebugLoc &DL = MI->getDebugLoc();
1512 
1513       int64_t Offset = FrameInfo.getObjectOffset(Index);
1514       if (ST.enableFlatScratch()) {
1515         if (TII->isFLATScratch(*MI)) {
1516           assert((int16_t)FIOperandNum ==
1517                  AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1518                                             AMDGPU::OpName::saddr));
1519 
1520           // The offset is always swizzled, so just replace it.
1521           if (FrameReg)
1522             FIOp.ChangeToRegister(FrameReg, false);
1523 
1524           if (!Offset)
1525             return;
1526 
1527           MachineOperand *OffsetOp =
1528             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1529           int64_t NewOffset = Offset + OffsetOp->getImm();
1530           if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1531                                      true)) {
1532             OffsetOp->setImm(NewOffset);
1533             if (FrameReg)
1534               return;
1535             Offset = 0;
1536           }
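               // Past this point either the combined offset has been folded into
               // the immediate field (with Offset cleared), or Offset is still
               // pending and has to be applied through a register further below.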
1537 
1538           assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
1539                  "Unexpected vaddr for flat scratch with a FI operand");
1540 
1541           // On GFX10 we have ST mode to use no registers for an address.
1542           // Otherwise we need to materialize 0 into an SGPR.
1543           if (!Offset && ST.hasFlatScratchSTMode()) {
1544             unsigned Opc = MI->getOpcode();
1545             unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
1546             MI->RemoveOperand(
1547                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
1548             MI->setDesc(TII->get(NewOpc));
1549             return;
1550           }
1551         }
1552 
1553         if (!FrameReg) {
1554           FIOp.ChangeToImmediate(Offset);
1555           if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
1556             return;
1557         }
1558 
1559         // We need to use a register here. Check if we can use an SGPR or
1560         // need a VGPR.
1561         FIOp.ChangeToRegister(AMDGPU::M0, false);
1562         bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
1563 
1564         if (!Offset && FrameReg && UseSGPR) {
1565           FIOp.setReg(FrameReg);
1566           return;
1567         }
1568 
1569         const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
1570                                                 : &AMDGPU::VGPR_32RegClass;
1571 
1572         Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
1573         FIOp.setReg(TmpReg);
1574         FIOp.setIsKill(true);
1575 
1576         if ((!FrameReg || !Offset) && TmpReg) {
1577           unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1578           auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
1579           if (FrameReg)
1580             MIB.addReg(FrameReg);
1581           else
1582             MIB.addImm(Offset);
1583 
1584           return;
1585         }
1586 
1587         Register TmpSReg =
1588             UseSGPR ? TmpReg
1589                     : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
1590                                            !UseSGPR);
1591 
1592         // TODO: for flat scratch another attempt can be made with a VGPR index
1593         //       if no SGPRs can be scavenged.
1594         if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
1595           report_fatal_error("Cannot scavenge register in FI elimination!");
1596 
1597         if (!TmpSReg) {
1598           // Use frame register and restore it after.
1599           TmpSReg = FrameReg;
1600           FIOp.setReg(FrameReg);
1601           FIOp.setIsKill(false);
1602         }
1603 
1604         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
1605           .addReg(FrameReg)
1606           .addImm(Offset);
1607 
1608         if (!UseSGPR)
1609           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1610             .addReg(TmpSReg, RegState::Kill);
1611 
1612         if (TmpSReg == FrameReg) {
1613           // Undo frame register modification.
1614           BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
1615                   FrameReg)
1616             .addReg(FrameReg)
1617             .addImm(Offset);
1618         }
1619 
1620         return;
1621       }
1622 
1623       bool IsMUBUF = TII->isMUBUF(*MI);
1624 
1625       if (!IsMUBUF && !MFI->isEntryFunction()) {
1626         // Convert to a swizzled stack address by scaling by the wave size.
1627         //
1628         // In an entry function/kernel the offset is already swizzled.
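             // For example, on a wave64 subtarget the frame register holds a
             // wave-granular (unswizzled) byte offset, so the V_LSHRREV below
             // shifts it right by getWavefrontSizeLog2() == 6 (i.e. divides by
             // 64) before the per-object offset is added.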
1629 
1630         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1631         Register ResultReg =
1632             IsCopy ? MI->getOperand(0).getReg()
1633                    : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1634 
1635         int64_t Offset = FrameInfo.getObjectOffset(Index);
1636         if (Offset == 0) {
1637           // XXX - This never happens because of the emergency scavenging slot at 0?
1638           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1639             .addImm(ST.getWavefrontSizeLog2())
1640             .addReg(FrameReg);
1641         } else {
1642           if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
1643             // Reuse ResultReg in intermediate step.
1644             Register ScaledReg = ResultReg;
1645 
1646             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
1647                     ScaledReg)
1648               .addImm(ST.getWavefrontSizeLog2())
1649               .addReg(FrameReg);
1650 
1651             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
1652 
1653             // TODO: Fold if use instruction is another add of a constant.
1654             if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1655               // FIXME: This can fail
1656               MIB.addImm(Offset);
1657               MIB.addReg(ScaledReg, RegState::Kill);
1658               if (!IsVOP2)
1659                 MIB.addImm(0); // clamp bit
1660             } else {
1661               assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
1662                      "Need to reuse carry out register");
1663 
1664               // Use scavenged unused carry out as offset register.
1665               Register ConstOffsetReg;
1666               if (!isWave32)
1667                 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
1668               else
1669                 ConstOffsetReg = MIB.getReg(1);
1670 
1671               BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
1672                 .addImm(Offset);
1673               MIB.addReg(ConstOffsetReg, RegState::Kill);
1674               MIB.addReg(ScaledReg, RegState::Kill);
1675               MIB.addImm(0); // clamp bit
1676             }
1677           } else {
1678             // We have to produce a carry out, and there isn't a free SGPR pair
1679             // for it. We can keep the whole computation on the SALU to avoid
1680             // clobbering an additional register at the cost of an extra mov.
1681 
1682             // We may have 1 free scratch SGPR even though a carry out is
1683             // unavailable. Only one additional mov is needed.
1684             Register TmpScaledReg =
1685                 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
1686             Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
1687 
1688             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
1689               .addReg(FrameReg)
1690               .addImm(ST.getWavefrontSizeLog2());
1691             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
1692               .addReg(ScaledReg, RegState::Kill)
1693               .addImm(Offset);
1694             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
1695               .addReg(ScaledReg, RegState::Kill);
1696 
1697             // If there were truly no free SGPRs, we need to undo everything.
1698             if (!TmpScaledReg.isValid()) {
1699               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
1700                 .addReg(ScaledReg, RegState::Kill)
1701                 .addImm(Offset);
1702               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
1703                 .addReg(FrameReg)
1704                 .addImm(ST.getWavefrontSizeLog2());
1705             }
1706           }
1707         }
1708 
1709         // Don't introduce an extra copy if we're just materializing in a mov.
1710         if (IsCopy)
1711           MI->eraseFromParent();
1712         else
1713           FIOp.ChangeToRegister(ResultReg, false, false, true);
1714         return;
1715       }
1716 
1717       if (IsMUBUF) {
1718         // Disable offen so we don't need a 0 vgpr base.
1719         assert(static_cast<int>(FIOperandNum) ==
1720                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1721                                           AMDGPU::OpName::vaddr));
1722 
1723         auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
1724         assert((SOffset.isImm() && SOffset.getImm() == 0));
1725 
1726         if (FrameReg != AMDGPU::NoRegister)
1727           SOffset.ChangeToRegister(FrameReg, false);
1728 
1729         int64_t Offset = FrameInfo.getObjectOffset(Index);
1730         int64_t OldImm
1731           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1732         int64_t NewOffset = OldImm + Offset;
1733 
1734         if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
1735             buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
1736           MI->eraseFromParent();
1737           return;
1738         }
1739       }
1740 
1741       // If the offset is simply too big, don't convert to a scratch wave offset
1742       // relative index.
1743 
1744       FIOp.ChangeToImmediate(Offset);
1745       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1746         Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1747         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1748           .addImm(Offset);
1749         FIOp.ChangeToRegister(TmpReg, false, false, true);
1750       }
1751     }
1752   }
1753 }
1754 
1755 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
1756   return AMDGPUInstPrinter::getRegisterName(Reg);
1757 }
1758 
1759 static const TargetRegisterClass *
1760 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
1761   if (BitWidth <= 64)
1762     return &AMDGPU::VReg_64RegClass;
1763   if (BitWidth <= 96)
1764     return &AMDGPU::VReg_96RegClass;
1765   if (BitWidth <= 128)
1766     return &AMDGPU::VReg_128RegClass;
1767   if (BitWidth <= 160)
1768     return &AMDGPU::VReg_160RegClass;
1769   if (BitWidth <= 192)
1770     return &AMDGPU::VReg_192RegClass;
1771   if (BitWidth <= 256)
1772     return &AMDGPU::VReg_256RegClass;
1773   if (BitWidth <= 512)
1774     return &AMDGPU::VReg_512RegClass;
1775   if (BitWidth <= 1024)
1776     return &AMDGPU::VReg_1024RegClass;
1777 
1778   return nullptr;
1779 }
1780 
1781 static const TargetRegisterClass *
1782 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
1783   if (BitWidth <= 64)
1784     return &AMDGPU::VReg_64_Align2RegClass;
1785   if (BitWidth <= 96)
1786     return &AMDGPU::VReg_96_Align2RegClass;
1787   if (BitWidth <= 128)
1788     return &AMDGPU::VReg_128_Align2RegClass;
1789   if (BitWidth <= 160)
1790     return &AMDGPU::VReg_160_Align2RegClass;
1791   if (BitWidth <= 192)
1792     return &AMDGPU::VReg_192_Align2RegClass;
1793   if (BitWidth <= 256)
1794     return &AMDGPU::VReg_256_Align2RegClass;
1795   if (BitWidth <= 512)
1796     return &AMDGPU::VReg_512_Align2RegClass;
1797   if (BitWidth <= 1024)
1798     return &AMDGPU::VReg_1024_Align2RegClass;
1799 
1800   return nullptr;
1801 }
1802 
1803 const TargetRegisterClass *
1804 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
1805   if (BitWidth == 1)
1806     return &AMDGPU::VReg_1RegClass;
1807   if (BitWidth <= 16)
1808     return &AMDGPU::VGPR_LO16RegClass;
1809   if (BitWidth <= 32)
1810     return &AMDGPU::VGPR_32RegClass;
1811   return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
1812                                 : getAnyVGPRClassForBitWidth(BitWidth);
1813 }
1814 
1815 static const TargetRegisterClass *
1816 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
1817   if (BitWidth <= 64)
1818     return &AMDGPU::AReg_64RegClass;
1819   if (BitWidth <= 96)
1820     return &AMDGPU::AReg_96RegClass;
1821   if (BitWidth <= 128)
1822     return &AMDGPU::AReg_128RegClass;
1823   if (BitWidth <= 160)
1824     return &AMDGPU::AReg_160RegClass;
1825   if (BitWidth <= 192)
1826     return &AMDGPU::AReg_192RegClass;
1827   if (BitWidth <= 256)
1828     return &AMDGPU::AReg_256RegClass;
1829   if (BitWidth <= 512)
1830     return &AMDGPU::AReg_512RegClass;
1831   if (BitWidth <= 1024)
1832     return &AMDGPU::AReg_1024RegClass;
1833 
1834   return nullptr;
1835 }
1836 
1837 static const TargetRegisterClass *
1838 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
1839   if (BitWidth <= 64)
1840     return &AMDGPU::AReg_64_Align2RegClass;
1841   if (BitWidth <= 96)
1842     return &AMDGPU::AReg_96_Align2RegClass;
1843   if (BitWidth <= 128)
1844     return &AMDGPU::AReg_128_Align2RegClass;
1845   if (BitWidth <= 160)
1846     return &AMDGPU::AReg_160_Align2RegClass;
1847   if (BitWidth <= 192)
1848     return &AMDGPU::AReg_192_Align2RegClass;
1849   if (BitWidth <= 256)
1850     return &AMDGPU::AReg_256_Align2RegClass;
1851   if (BitWidth <= 512)
1852     return &AMDGPU::AReg_512_Align2RegClass;
1853   if (BitWidth <= 1024)
1854     return &AMDGPU::AReg_1024_Align2RegClass;
1855 
1856   return nullptr;
1857 }
1858 
1859 const TargetRegisterClass *
1860 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
1861   if (BitWidth <= 16)
1862     return &AMDGPU::AGPR_LO16RegClass;
1863   if (BitWidth <= 32)
1864     return &AMDGPU::AGPR_32RegClass;
1865   return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
1866                                 : getAnyAGPRClassForBitWidth(BitWidth);
1867 }
1868 
1869 const TargetRegisterClass *
1870 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
1871   if (BitWidth <= 16)
1872     return &AMDGPU::SGPR_LO16RegClass;
1873   if (BitWidth <= 32)
1874     return &AMDGPU::SReg_32RegClass;
1875   if (BitWidth <= 64)
1876     return &AMDGPU::SReg_64RegClass;
1877   if (BitWidth <= 96)
1878     return &AMDGPU::SGPR_96RegClass;
1879   if (BitWidth <= 128)
1880     return &AMDGPU::SGPR_128RegClass;
1881   if (BitWidth <= 160)
1882     return &AMDGPU::SGPR_160RegClass;
1883   if (BitWidth <= 192)
1884     return &AMDGPU::SGPR_192RegClass;
1885   if (BitWidth <= 256)
1886     return &AMDGPU::SGPR_256RegClass;
1887   if (BitWidth <= 512)
1888     return &AMDGPU::SGPR_512RegClass;
1889   if (BitWidth <= 1024)
1890     return &AMDGPU::SGPR_1024RegClass;
1891 
1892   return nullptr;
1893 }
1894 
1895 // FIXME: This is very slow. It might be worth creating a map from physreg to
1896 // register class.
1897 const TargetRegisterClass *
1898 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
1899   static const TargetRegisterClass *const BaseClasses[] = {
1900     &AMDGPU::VGPR_LO16RegClass,
1901     &AMDGPU::VGPR_HI16RegClass,
1902     &AMDGPU::SReg_LO16RegClass,
1903     &AMDGPU::AGPR_LO16RegClass,
1904     &AMDGPU::VGPR_32RegClass,
1905     &AMDGPU::SReg_32RegClass,
1906     &AMDGPU::AGPR_32RegClass,
1908     &AMDGPU::VReg_64_Align2RegClass,
1909     &AMDGPU::VReg_64RegClass,
1910     &AMDGPU::SReg_64RegClass,
1911     &AMDGPU::AReg_64_Align2RegClass,
1912     &AMDGPU::AReg_64RegClass,
1913     &AMDGPU::VReg_96_Align2RegClass,
1914     &AMDGPU::VReg_96RegClass,
1915     &AMDGPU::SReg_96RegClass,
1916     &AMDGPU::AReg_96_Align2RegClass,
1917     &AMDGPU::AReg_96RegClass,
1918     &AMDGPU::VReg_128_Align2RegClass,
1919     &AMDGPU::VReg_128RegClass,
1920     &AMDGPU::SReg_128RegClass,
1921     &AMDGPU::AReg_128_Align2RegClass,
1922     &AMDGPU::AReg_128RegClass,
1923     &AMDGPU::VReg_160_Align2RegClass,
1924     &AMDGPU::VReg_160RegClass,
1925     &AMDGPU::SReg_160RegClass,
1926     &AMDGPU::AReg_160_Align2RegClass,
1927     &AMDGPU::AReg_160RegClass,
1928     &AMDGPU::VReg_192_Align2RegClass,
1929     &AMDGPU::VReg_192RegClass,
1930     &AMDGPU::SReg_192RegClass,
1931     &AMDGPU::AReg_192_Align2RegClass,
1932     &AMDGPU::AReg_192RegClass,
1933     &AMDGPU::VReg_256_Align2RegClass,
1934     &AMDGPU::VReg_256RegClass,
1935     &AMDGPU::SReg_256RegClass,
1936     &AMDGPU::AReg_256_Align2RegClass,
1937     &AMDGPU::AReg_256RegClass,
1938     &AMDGPU::VReg_512_Align2RegClass,
1939     &AMDGPU::VReg_512RegClass,
1940     &AMDGPU::SReg_512RegClass,
1941     &AMDGPU::AReg_512_Align2RegClass,
1942     &AMDGPU::AReg_512RegClass,
1943     &AMDGPU::SReg_1024RegClass,
1944     &AMDGPU::VReg_1024_Align2RegClass,
1945     &AMDGPU::VReg_1024RegClass,
1946     &AMDGPU::AReg_1024_Align2RegClass,
1947     &AMDGPU::AReg_1024RegClass,
1948     &AMDGPU::SCC_CLASSRegClass,
1949     &AMDGPU::Pseudo_SReg_32RegClass,
1950     &AMDGPU::Pseudo_SReg_128RegClass,
1951   };
1952 
1953   for (const TargetRegisterClass *BaseClass : BaseClasses) {
1954     if (BaseClass->contains(Reg)) {
1955       return BaseClass;
1956     }
1957   }
1958   return nullptr;
1959 }
1960 
1961 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
1962                                Register Reg) const {
1963   const TargetRegisterClass *RC;
1964   if (Reg.isVirtual())
1965     RC = MRI.getRegClass(Reg);
1966   else
1967     RC = getPhysRegClass(Reg);
1968   return isSGPRClass(RC);
1969 }
1970 
1971 // TODO: It might be helpful to have some target specific flags in
1972 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1973 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1974   unsigned Size = getRegSizeInBits(*RC);
1975   if (Size == 16) {
1976     return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
1977            getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
1978   }
1979   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
1980   if (!VRC) {
1981     assert(Size < 32 && "Invalid register class size");
1982     return false;
1983   }
1984   return getCommonSubClass(VRC, RC) != nullptr;
1985 }
1986 
1987 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
1988   unsigned Size = getRegSizeInBits(*RC);
1989   if (Size < 16)
1990     return false;
1991   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
1992   if (!ARC) {
1993     assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
1994     return false;
1995   }
1996   return getCommonSubClass(ARC, RC) != nullptr;
1997 }
1998 
1999 const TargetRegisterClass *
2000 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
2001   unsigned Size = getRegSizeInBits(*SRC);
2002   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
2003   assert(VRC && "Invalid register class size");
2004   return VRC;
2005 }
2006 
2007 const TargetRegisterClass *
2008 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
2009   unsigned Size = getRegSizeInBits(*SRC);
2010   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
2011   assert(ARC && "Invalid register class size");
2012   return ARC;
2013 }
2014 
2015 const TargetRegisterClass *
2016 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
2017   unsigned Size = getRegSizeInBits(*VRC);
2018   if (Size == 32)
2019     return &AMDGPU::SGPR_32RegClass;
2020   const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
2021   assert(SRC && "Invalid register class size");
2022   return SRC;
2023 }
2024 
2025 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
2026                          const TargetRegisterClass *RC, unsigned SubIdx) const {
2027   if (SubIdx == AMDGPU::NoSubRegister)
2028     return RC;
2029 
2030   // We can assume that each lane corresponds to one 32-bit register.
2031   unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
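       // e.g. a two-channel index such as sub0_sub1 gives Size = 64, which maps
       // an SGPR super-class to a 64-bit scalar class and a VGPR/AGPR
       // super-class to the matching 64-bit vector class.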
2032   if (isSGPRClass(RC)) {
2033     if (Size == 32)
2034       RC = &AMDGPU::SGPR_32RegClass;
2035     else
2036       RC = getSGPRClassForBitWidth(Size);
2037   } else if (hasAGPRs(RC)) {
2038     RC = getAGPRClassForBitWidth(Size);
2039   } else {
2040     RC = getVGPRClassForBitWidth(Size);
2041   }
2042   assert(RC && "Invalid sub-register class size");
2043   return RC;
2044 }
2045 
2046 const TargetRegisterClass *
2047 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
2048                                          const TargetRegisterClass *SubRC,
2049                                          unsigned SubIdx) const {
2050   // Ensure this subregister index is aligned in the super register.
2051   const TargetRegisterClass *MatchRC =
2052       getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
2053   return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
2054 }
2055 
2056 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
2057   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
2058       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
2059     return !ST.hasMFMAInlineLiteralBug();
2060 
2061   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
2062          OpType <= AMDGPU::OPERAND_SRC_LAST;
2063 }
2064 
2065 bool SIRegisterInfo::shouldRewriteCopySrc(
2066   const TargetRegisterClass *DefRC,
2067   unsigned DefSubReg,
2068   const TargetRegisterClass *SrcRC,
2069   unsigned SrcSubReg) const {
2070   // We want to prefer the smallest register class possible, so we don't want to
2071   // stop and rewrite on anything that looks like a subregister
2072   // extract. Operations mostly don't care about the super register class, so we
2073   // only want to stop on the most basic of copies between the same register
2074   // class.
2075   //
2076   // e.g. if we have something like
2077   // %0 = ...
2078   // %1 = ...
2079   // %2 = REG_SEQUENCE %0, sub0, %1, sub1
2080   // %3 = COPY %2.sub0
2081   //
2082   // We want to look through the COPY to find:
2083   //  => %3 = COPY %0
2084 
2085   // Plain copy.
2086   return getCommonSubClass(DefRC, SrcRC) != nullptr;
2087 }
2088 
2089 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
2090   // TODO: 64-bit operands have extending behavior from 32-bit literal.
2091   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
2092          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
2093 }
2094 
2095 /// Returns the lowest-numbered register in \p RC that is not used at any
2096 /// point in the function. If all registers are used, returns
2097 /// AMDGPU::NoRegister. If \p ReserveHighestVGPR is true, the highest unused
2098 /// register is returned instead.
2099 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
2100                                               const TargetRegisterClass *RC,
2101                                               const MachineFunction &MF,
2102                                               bool ReserveHighestVGPR) const {
2103   if (ReserveHighestVGPR) {
2104     for (MCRegister Reg : reverse(*RC))
2105       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2106         return Reg;
2107   } else {
2108     for (MCRegister Reg : *RC)
2109       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2110         return Reg;
2111   }
2112   return MCRegister();
2113 }
2114 
2115 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
2116                                                    unsigned EltSize) const {
2117   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
2118   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2119 
2120   const unsigned RegDWORDs = RegBitWidth / 32;
2121   const unsigned EltDWORDs = EltSize / 4;
2122   assert(RegSplitParts.size() + 1 >= EltDWORDs);
2123 
2124   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2125   const unsigned NumParts = RegDWORDs / EltDWORDs;
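       // Illustrative example: a 128-bit register class split with EltSize = 8
       // gives RegDWORDs = 4 and EltDWORDs = 2, so Parts comes from
       // RegSplitParts[1] and the two returned indices name the 64-bit halves
       // (conceptually sub0_sub1 and sub2_sub3).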
2126 
2127   return makeArrayRef(Parts.data(), NumParts);
2128 }
2129 
2130 const TargetRegisterClass*
2131 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
2132                                   Register Reg) const {
2133   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
2134 }
2135 
2136 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
2137                             Register Reg) const {
2138   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2139   // Registers without classes are unaddressable, SGPR-like registers.
2140   return RC && hasVGPRs(RC);
2141 }
2142 
2143 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
2144                             Register Reg) const {
2145   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2146 
2147   // Registers without classes are unaddressable, SGPR-like registers.
2148   return RC && hasAGPRs(RC);
2149 }
2150 
2151 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
2152                                     const TargetRegisterClass *SrcRC,
2153                                     unsigned SubReg,
2154                                     const TargetRegisterClass *DstRC,
2155                                     unsigned DstSubReg,
2156                                     const TargetRegisterClass *NewRC,
2157                                     LiveIntervals &LIS) const {
2158   unsigned SrcSize = getRegSizeInBits(*SrcRC);
2159   unsigned DstSize = getRegSizeInBits(*DstRC);
2160   unsigned NewSize = getRegSizeInBits(*NewRC);
2161 
2162   // Do not increase the size of registers beyond a dword; we would need to
2163   // allocate adjacent registers and constrain regalloc more than needed.
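       // For example, coalescing two 64-bit registers into a 128-bit NewRC gives
       // NewSize = 128 > SrcSize = DstSize = 64 and is rejected, while any copy
       // where either side is 32 bits or narrower is always accepted.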
2164 
2165   // Always allow dword coalescing.
2166   if (SrcSize <= 32 || DstSize <= 32)
2167     return true;
2168 
2169   return NewSize <= DstSize || NewSize <= SrcSize;
2170 }
2171 
2172 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
2173                                              MachineFunction &MF) const {
2174   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2175 
2176   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
2177                                                        MF.getFunction());
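       // The limit is the number of registers that still sustains the occupancy
       // implied by the LDS usage, further capped by any per-function maximum.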
2178   switch (RC->getID()) {
2179   default:
2180     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
2181   case AMDGPU::VGPR_32RegClassID:
2182   case AMDGPU::VGPR_LO16RegClassID:
2183   case AMDGPU::VGPR_HI16RegClassID:
2184     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
2185   case AMDGPU::SGPR_32RegClassID:
2186   case AMDGPU::SGPR_LO16RegClassID:
2187     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
2188   }
2189 }
2190 
2191 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
2192                                                 unsigned Idx) const {
2193   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
2194       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
2195     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
2196                                const_cast<MachineFunction &>(MF));
2197 
2198   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
2199     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
2200                                const_cast<MachineFunction &>(MF));
2201 
2202   llvm_unreachable("Unexpected register pressure set!");
2203 }
2204 
2205 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
2206   static const int Empty[] = { -1 };
2207 
2208   if (RegPressureIgnoredUnits[RegUnit])
2209     return Empty;
2210 
2211   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
2212 }
2213 
2214 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
2215   // Not a callee saved register.
2216   return AMDGPU::SGPR30_SGPR31;
2217 }
2218 
2219 const TargetRegisterClass *
2220 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
2221                                          const RegisterBank &RB,
2222                                          const MachineRegisterInfo &MRI) const {
2223   switch (RB.getID()) {
2224   case AMDGPU::VGPRRegBankID:
2225     return getVGPRClassForBitWidth(std::max(32u, Size));
2226   case AMDGPU::VCCRegBankID:
2227     assert(Size == 1);
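         // A 1-bit value on the VCC bank is a wave-wide lane mask, so it
         // occupies a full 32-bit (wave32) or 64-bit (wave64) scalar register.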
2228     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
2229                     : &AMDGPU::SReg_64_XEXECRegClass;
2230   case AMDGPU::SGPRRegBankID:
2231     return getSGPRClassForBitWidth(std::max(32u, Size));
2232   case AMDGPU::AGPRRegBankID:
2233     return getAGPRClassForBitWidth(std::max(32u, Size));
2234   default:
2235     llvm_unreachable("unknown register bank");
2236   }
2237 }
2238 
2239 const TargetRegisterClass *
2240 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
2241                                          const MachineRegisterInfo &MRI) const {
2242   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
2243   if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
2244     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
2245 
2246   const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
2247   return getAllocatableClass(RC);
2248 }
2249 
2250 MCRegister SIRegisterInfo::getVCC() const {
2251   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
2252 }
2253 
2254 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
2255   // VGPR tuples have an alignment requirement on gfx90a variants.
2256   return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
2257                                 : &AMDGPU::VReg_64RegClass;
2258 }
2259 
2260 const TargetRegisterClass *
2261 SIRegisterInfo::getRegClass(unsigned RCID) const {
2262   switch ((int)RCID) {
2263   case AMDGPU::SReg_1RegClassID:
2264     return getBoolRC();
2265   case AMDGPU::SReg_1_XEXECRegClassID:
2266     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
2267       : &AMDGPU::SReg_64_XEXECRegClass;
2268   case -1:
2269     return nullptr;
2270   default:
2271     return AMDGPUGenRegisterInfo::getRegClass(RCID);
2272   }
2273 }
2274 
2275 // Find the reaching register definition.
2276 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
2277                                               MachineInstr &Use,
2278                                               MachineRegisterInfo &MRI,
2279                                               LiveIntervals *LIS) const {
2280   auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
2281   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
2282   SlotIndex DefIdx;
2283 
2284   if (Reg.isVirtual()) {
2285     if (!LIS->hasInterval(Reg))
2286       return nullptr;
2287     LiveInterval &LI = LIS->getInterval(Reg);
2288     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
2289                                   : MRI.getMaxLaneMaskForVReg(Reg);
2290     VNInfo *V = nullptr;
2291     if (LI.hasSubRanges()) {
2292       for (auto &S : LI.subranges()) {
2293         if ((S.LaneMask & SubLanes) == SubLanes) {
2294           V = S.getVNInfoAt(UseIdx);
2295           break;
2296         }
2297       }
2298     } else {
2299       V = LI.getVNInfoAt(UseIdx);
2300     }
2301     if (!V)
2302       return nullptr;
2303     DefIdx = V->def;
2304   } else {
2305     // Find last def.
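         // Walk every register unit of the physical register; each unit must
         // have a value live at UseIdx, and we prefer the def that the
         // previously seen defs dominate (i.e. the latest reaching def).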
2306     for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
2307          ++Units) {
2308       LiveRange &LR = LIS->getRegUnit(*Units);
2309       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
2310         if (!DefIdx.isValid() ||
2311             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
2312                           LIS->getInstructionFromIndex(V->def)))
2313           DefIdx = V->def;
2314       } else {
2315         return nullptr;
2316       }
2317     }
2318   }
2319 
2320   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
2321 
2322   if (!Def || !MDT.dominates(Def, &Use))
2323     return nullptr;
2324 
2325   assert(Def->modifiesRegister(Reg, this));
2326 
2327   return Def;
2328 }
2329 
2330 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
2331   assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);
2332 
2333   for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
2334                                          AMDGPU::SReg_32RegClass,
2335                                          AMDGPU::AGPR_32RegClass } ) {
2336     if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
2337       return Super;
2338   }
2339   if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
2340                                             &AMDGPU::VGPR_32RegClass)) {
2341       return Super;
2342   }
2343 
2344   return AMDGPU::NoRegister;
2345 }
2346 
2347 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
2348   switch (PhysReg) {
2349   case AMDGPU::SGPR_NULL:
2350   case AMDGPU::SRC_SHARED_BASE:
2351   case AMDGPU::SRC_PRIVATE_BASE:
2352   case AMDGPU::SRC_SHARED_LIMIT:
2353   case AMDGPU::SRC_PRIVATE_LIMIT:
2354     return true;
2355   default:
2356     return false;
2357   }
2358 }
2359 
2360 ArrayRef<MCPhysReg>
2361 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
2362   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
2363                       ST.getMaxNumSGPRs(MF) / 4);
2364 }
2365 
2366 ArrayRef<MCPhysReg>
2367 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
2368   return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
2369                       ST.getMaxNumSGPRs(MF) / 2);
2370 }
2371 
2372 ArrayRef<MCPhysReg>
2373 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
2374   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
2375 }
2376