1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIRegisterInfo.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "MCTargetDesc/AMDGPUInstPrinter.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/MachineDominators.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 #include "llvm/CodeGen/SlotIndexes.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/LLVMContext.h"
29 
30 using namespace llvm;
31 
/// Return true if \p PSetID occurs in \p PSets, a list of pressure-set IDs
/// terminated by -1.
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  const int Wanted = static_cast<int>(PSetID);
  for (const int *Cur = PSets; *Cur != -1; ++Cur)
    if (*Cur == Wanted)
      return true;
  return false;
}
39 
40 void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
41                                          BitVector &PressureSets) const {
42   for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
43     const int *PSets = getRegUnitPressureSets(*U);
44     if (hasPressureSet(PSets, PSetID)) {
45       PressureSets.set(PSetID);
46       break;
47     }
48   }
49 }
50 
51 static cl::opt<bool> EnableSpillSGPRToSMEM(
52   "amdgpu-spill-sgpr-to-smem",
53   cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableSpillSGPRToVGPR(
57   "amdgpu-spill-sgpr-to-vgpr",
58   cl::desc("Enable spilling VGPRs to SGPRs"),
59   cl::ReallyHidden,
60   cl::init(true));
61 
62 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
63   AMDGPURegisterInfo(),
64   SGPRPressureSets(getNumRegPressureSets()),
65   VGPRPressureSets(getNumRegPressureSets()),
66   AGPRPressureSets(getNumRegPressureSets()),
67   SpillSGPRToVGPR(false),
68   SpillSGPRToSMEM(false),
69   isWave32(ST.isWave32()) {
70   if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
71     SpillSGPRToSMEM = true;
72   else if (EnableSpillSGPRToVGPR)
73     SpillSGPRToVGPR = true;
74 
75   unsigned NumRegPressureSets = getNumRegPressureSets();
76 
77   SGPRSetID = NumRegPressureSets;
78   VGPRSetID = NumRegPressureSets;
79   AGPRSetID = NumRegPressureSets;
80 
81   for (unsigned i = 0; i < NumRegPressureSets; ++i) {
82     classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
83     classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
84     classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
85   }
86 
87   // Determine the number of reg units for each pressure set.
88   std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
89   for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
90     const int *PSets = getRegUnitPressureSets(i);
91     for (unsigned j = 0; PSets[j] != -1; ++j) {
92       ++PressureSetRegUnits[PSets[j]];
93     }
94   }
95 
96   unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
97   for (unsigned i = 0; i < NumRegPressureSets; ++i) {
98     if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
99       VGPRSetID = i;
100       VGPRMax = PressureSetRegUnits[i];
101       continue;
102     }
103     if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
104       SGPRSetID = i;
105       SGPRMax = PressureSetRegUnits[i];
106     }
107     if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
108       AGPRSetID = i;
109       AGPRMax = PressureSetRegUnits[i];
110       continue;
111     }
112   }
113 
114   assert(SGPRSetID < NumRegPressureSets &&
115          VGPRSetID < NumRegPressureSets &&
116          AGPRSetID < NumRegPressureSets);
117 }
118 
119 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
120   const MachineFunction &MF) const {
121 
122   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
123   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
124   unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
125   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
126 }
127 
/// Choose the SGPR index for the wave byte offset given \p RegCount usable
/// SGPRs.
///
/// If RegCount is not a multiple of 4, the 4-aligned segment buffer cannot
/// occupy (RegCount - 4) ... (RegCount - 1), which leaves a hole at the top;
/// the offset goes in the last register of that hole. Otherwise the segment
/// buffer takes the top four registers and the offset is placed just below
/// it.
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
  const bool CountIsAligned = (RegCount & 3) == 0;
  return CountIsAligned ? RegCount - 5 : RegCount - 1;
}
144 
145 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
146   const MachineFunction &MF) const {
147   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
148   unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
149   return AMDGPU::SGPR_32RegClass.getRegister(Reg);
150 }
151 
152 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
153   BitVector Reserved(getNumRegs());
154 
155   // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
156   // this seems likely to result in bugs, so I'm marking them as reserved.
157   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
158   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
159 
160   // M0 has to be reserved so that llvm accepts it as a live-in into a block.
161   reserveRegisterTuples(Reserved, AMDGPU::M0);
162 
163   // Reserve src_vccz, src_execz, src_scc.
164   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
165   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
166   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
167 
168   // Reserve the memory aperture registers.
169   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
170   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
171   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
172   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
173 
174   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
175   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
176 
177   // Reserve xnack_mask registers - support is not implemented in Codegen.
178   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
179 
180   // Reserve lds_direct register - support is not implemented in Codegen.
181   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
182 
183   // Reserve Trap Handler registers - support is not implemented in Codegen.
184   reserveRegisterTuples(Reserved, AMDGPU::TBA);
185   reserveRegisterTuples(Reserved, AMDGPU::TMA);
186   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
187   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
188   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
189   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
190   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
191   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
192   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
193   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
194 
195   // Reserve null register - it shall never be allocated
196   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
197 
198   // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
199   // will result in bugs.
200   if (isWave32) {
201     Reserved.set(AMDGPU::VCC);
202     Reserved.set(AMDGPU::VCC_HI);
203   }
204 
205   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
206 
207   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
208   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
209   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
210     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
211     reserveRegisterTuples(Reserved, Reg);
212   }
213 
214   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
215   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
216   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
217     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
218     reserveRegisterTuples(Reserved, Reg);
219     Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
220     reserveRegisterTuples(Reserved, Reg);
221   }
222 
223   // Reserve all the rest AGPRs if there are no instructions to use it.
224   if (!ST.hasMAIInsts()) {
225     for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
226       unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
227       reserveRegisterTuples(Reserved, Reg);
228     }
229   }
230 
231   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
232 
233   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
234   if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
235     // Reserve 1 SGPR for scratch wave offset in case we need to spill.
236     reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
237   }
238 
239   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
240   if (ScratchRSrcReg != AMDGPU::NoRegister) {
241     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
242     // to spill.
243     // TODO: May need to reserve a VGPR if doing LDS spilling.
244     reserveRegisterTuples(Reserved, ScratchRSrcReg);
245     assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
246   }
247 
248   // We have to assume the SP is needed in case there are calls in the function,
249   // which is detected after the function is lowered. If we aren't really going
250   // to need SP, don't bother reserving it.
251   unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
252 
253   if (StackPtrReg != AMDGPU::NoRegister) {
254     reserveRegisterTuples(Reserved, StackPtrReg);
255     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
256   }
257 
258   unsigned FrameReg = MFI->getFrameOffsetReg();
259   if (FrameReg != AMDGPU::NoRegister) {
260     reserveRegisterTuples(Reserved, FrameReg);
261     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
262   }
263 
264   for (unsigned Reg : MFI->WWMReservedRegs) {
265     reserveRegisterTuples(Reserved, Reg);
266   }
267 
268   // FIXME: Stop using reserved registers for this.
269   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
270     reserveRegisterTuples(Reserved, Reg);
271 
272   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
273     reserveRegisterTuples(Reserved, Reg);
274 
275   return Reserved;
276 }
277 
278 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
279   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
280   // On entry, the base address is 0, so it can't possibly need any more
281   // alignment.
282 
283   // FIXME: Should be able to specify the entry frame alignment per calling
284   // convention instead.
285   if (Info->isEntryFunction())
286     return false;
287 
288   return TargetRegisterInfo::canRealignStack(MF);
289 }
290 
291 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
292   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
293   if (Info->isEntryFunction()) {
294     const MachineFrameInfo &MFI = Fn.getFrameInfo();
295     return MFI.hasStackObjects() || MFI.hasCalls();
296   }
297 
298   // May need scavenger for dealing with callee saved registers.
299   return true;
300 }
301 
302 bool SIRegisterInfo::requiresFrameIndexScavenging(
303   const MachineFunction &MF) const {
304   const MachineFrameInfo &MFI = MF.getFrameInfo();
305   if (MFI.hasStackObjects())
306     return true;
307 
308   // May need to deal with callee saved registers.
309   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
310   return !Info->isEntryFunction();
311 }
312 
313 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
314   const MachineFunction &MF) const {
315   const MachineFrameInfo &MFI = MF.getFrameInfo();
316   if (!MFI.hasStackObjects())
317     return false;
318 
319   // The scavenger is used for large frames which may require finding a free
320   // register for large offsets.
321   if (!isUInt<12>(MFI.getStackSize()))
322     return true;
323 
324   // If using scalar stores, for spills, m0 is needed for the scalar store
325   // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
326   // register for it during frame index elimination, so the scavenger is
327   // directly needed.
328   return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
329          MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
330 }
331 
332 bool SIRegisterInfo::requiresVirtualBaseRegisters(
333   const MachineFunction &) const {
334   // There are no special dedicated stack or frame pointers.
335   return true;
336 }
337 
338 bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
339   // This helps catch bugs as verifier errors.
340   return true;
341 }
342 
343 int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
344   assert(SIInstrInfo::isMUBUF(*MI));
345 
346   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
347                                           AMDGPU::OpName::offset);
348   return MI->getOperand(OffIdx).getImm();
349 }
350 
351 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
352                                                  int Idx) const {
353   if (!SIInstrInfo::isMUBUF(*MI))
354     return 0;
355 
356   assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
357                                            AMDGPU::OpName::vaddr) &&
358          "Should never see frame index on non-address operand");
359 
360   return getMUBUFInstrOffset(MI);
361 }
362 
363 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
364   if (!MI->mayLoadOrStore())
365     return false;
366 
367   int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
368 
369   return !isUInt<12>(FullOffset);
370 }
371 
372 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
373                                                   unsigned BaseReg,
374                                                   int FrameIdx,
375                                                   int64_t Offset) const {
376   MachineBasicBlock::iterator Ins = MBB->begin();
377   DebugLoc DL; // Defaults to "unknown"
378 
379   if (Ins != MBB->end())
380     DL = Ins->getDebugLoc();
381 
382   MachineFunction *MF = MBB->getParent();
383   const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
384   const SIInstrInfo *TII = Subtarget.getInstrInfo();
385 
386   if (Offset == 0) {
387     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
388       .addFrameIndex(FrameIdx);
389     return;
390   }
391 
392   MachineRegisterInfo &MRI = MF->getRegInfo();
393   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
394 
395   Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
396 
397   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
398     .addImm(Offset);
399   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
400     .addFrameIndex(FrameIdx);
401 
402   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
403     .addReg(OffsetReg, RegState::Kill)
404     .addReg(FIReg)
405     .addImm(0); // clamp bit
406 }
407 
408 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
409                                        int64_t Offset) const {
410 
411   MachineBasicBlock *MBB = MI.getParent();
412   MachineFunction *MF = MBB->getParent();
413   const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
414   const SIInstrInfo *TII = Subtarget.getInstrInfo();
415 
416 #ifndef NDEBUG
417   // FIXME: Is it possible to be storing a frame index to itself?
418   bool SeenFI = false;
419   for (const MachineOperand &MO: MI.operands()) {
420     if (MO.isFI()) {
421       if (SeenFI)
422         llvm_unreachable("should not see multiple frame indices");
423 
424       SeenFI = true;
425     }
426   }
427 #endif
428 
429   MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
430   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
431   assert(TII->isMUBUF(MI));
432   assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
433          MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
434          "should only be seeing frame offset relative FrameIndex");
435 
436 
437   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
438   int64_t NewOffset = OffsetOp->getImm() + Offset;
439   assert(isUInt<12>(NewOffset) && "offset should be legal");
440 
441   FIOp->ChangeToRegister(BaseReg, false);
442   OffsetOp->setImm(NewOffset);
443 }
444 
445 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
446                                         unsigned BaseReg,
447                                         int64_t Offset) const {
448   if (!SIInstrInfo::isMUBUF(*MI))
449     return false;
450 
451   int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
452 
453   return isUInt<12>(NewOffset);
454 }
455 
456 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
457   const MachineFunction &MF, unsigned Kind) const {
458   // This is inaccurate. It depends on the instruction and address space. The
459   // only place where we should hit this is for dealing with frame indexes /
460   // private accesses, so this is correct in that case.
461   return &AMDGPU::VGPR_32RegClass;
462 }
463 
464 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
465 
466   switch (Op) {
467   case AMDGPU::SI_SPILL_S1024_SAVE:
468   case AMDGPU::SI_SPILL_S1024_RESTORE:
469   case AMDGPU::SI_SPILL_V1024_SAVE:
470   case AMDGPU::SI_SPILL_V1024_RESTORE:
471   case AMDGPU::SI_SPILL_A1024_SAVE:
472   case AMDGPU::SI_SPILL_A1024_RESTORE:
473     return 32;
474   case AMDGPU::SI_SPILL_S512_SAVE:
475   case AMDGPU::SI_SPILL_S512_RESTORE:
476   case AMDGPU::SI_SPILL_V512_SAVE:
477   case AMDGPU::SI_SPILL_V512_RESTORE:
478   case AMDGPU::SI_SPILL_A512_SAVE:
479   case AMDGPU::SI_SPILL_A512_RESTORE:
480     return 16;
481   case AMDGPU::SI_SPILL_S256_SAVE:
482   case AMDGPU::SI_SPILL_S256_RESTORE:
483   case AMDGPU::SI_SPILL_V256_SAVE:
484   case AMDGPU::SI_SPILL_V256_RESTORE:
485     return 8;
486   case AMDGPU::SI_SPILL_S160_SAVE:
487   case AMDGPU::SI_SPILL_S160_RESTORE:
488   case AMDGPU::SI_SPILL_V160_SAVE:
489   case AMDGPU::SI_SPILL_V160_RESTORE:
490     return 5;
491   case AMDGPU::SI_SPILL_S128_SAVE:
492   case AMDGPU::SI_SPILL_S128_RESTORE:
493   case AMDGPU::SI_SPILL_V128_SAVE:
494   case AMDGPU::SI_SPILL_V128_RESTORE:
495   case AMDGPU::SI_SPILL_A128_SAVE:
496   case AMDGPU::SI_SPILL_A128_RESTORE:
497     return 4;
498   case AMDGPU::SI_SPILL_S96_SAVE:
499   case AMDGPU::SI_SPILL_S96_RESTORE:
500   case AMDGPU::SI_SPILL_V96_SAVE:
501   case AMDGPU::SI_SPILL_V96_RESTORE:
502     return 3;
503   case AMDGPU::SI_SPILL_S64_SAVE:
504   case AMDGPU::SI_SPILL_S64_RESTORE:
505   case AMDGPU::SI_SPILL_V64_SAVE:
506   case AMDGPU::SI_SPILL_V64_RESTORE:
507   case AMDGPU::SI_SPILL_A64_SAVE:
508   case AMDGPU::SI_SPILL_A64_RESTORE:
509     return 2;
510   case AMDGPU::SI_SPILL_S32_SAVE:
511   case AMDGPU::SI_SPILL_S32_RESTORE:
512   case AMDGPU::SI_SPILL_V32_SAVE:
513   case AMDGPU::SI_SPILL_V32_RESTORE:
514   case AMDGPU::SI_SPILL_A32_SAVE:
515   case AMDGPU::SI_SPILL_A32_RESTORE:
516     return 1;
517   default: llvm_unreachable("Invalid spill opcode");
518   }
519 }
520 
521 static int getOffsetMUBUFStore(unsigned Opc) {
522   switch (Opc) {
523   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
524     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
525   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
526     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
527   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
528     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
529   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
530     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
531   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
532     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
533   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
534     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
535   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
536     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
537   default:
538     return -1;
539   }
540 }
541 
542 static int getOffsetMUBUFLoad(unsigned Opc) {
543   switch (Opc) {
544   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
545     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
546   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
547     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
548   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
549     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
550   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
551     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
552   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
553     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
554   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
555     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
556   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
557     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
558   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
559     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
560   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
561     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
562   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
563     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
564   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
565     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
566   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
567     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
568   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
569     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
570   default:
571     return -1;
572   }
573 }
574 
575 static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
576                                            int Index,
577                                            unsigned Lane,
578                                            unsigned ValueReg,
579                                            bool IsKill) {
580   MachineBasicBlock *MBB = MI->getParent();
581   MachineFunction *MF = MI->getParent()->getParent();
582   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
583   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
584   const SIInstrInfo *TII = ST.getInstrInfo();
585 
586   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
587 
588   if (Reg == AMDGPU::NoRegister)
589     return MachineInstrBuilder();
590 
591   bool IsStore = MI->mayStore();
592   MachineRegisterInfo &MRI = MF->getRegInfo();
593   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
594 
595   unsigned Dst = IsStore ? Reg : ValueReg;
596   unsigned Src = IsStore ? ValueReg : Reg;
597   unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
598                                                    : AMDGPU::V_ACCVGPR_READ_B32;
599 
600   return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
601            .addReg(Src, getKillRegState(IsKill));
602 }
603 
604 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
605 // need to handle the case where an SGPR may need to be spilled while spilling.
606 static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
607                                       MachineFrameInfo &MFI,
608                                       MachineBasicBlock::iterator MI,
609                                       int Index,
610                                       int64_t Offset) {
611   MachineBasicBlock *MBB = MI->getParent();
612   const DebugLoc &DL = MI->getDebugLoc();
613   bool IsStore = MI->mayStore();
614 
615   unsigned Opc = MI->getOpcode();
616   int LoadStoreOp = IsStore ?
617     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
618   if (LoadStoreOp == -1)
619     return false;
620 
621   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
622   if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
623     return true;
624 
625   MachineInstrBuilder NewMI =
626       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
627           .add(*Reg)
628           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
629           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
630           .addImm(Offset)
631           .addImm(0) // glc
632           .addImm(0) // slc
633           .addImm(0) // tfe
634           .addImm(0) // dlc
635           .cloneMemRefs(*MI);
636 
637   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
638                                                        AMDGPU::OpName::vdata_in);
639   if (VDataIn)
640     NewMI.add(*VDataIn);
641   return true;
642 }
643 
644 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
645                                          unsigned LoadStoreOp,
646                                          int Index,
647                                          unsigned ValueReg,
648                                          bool IsKill,
649                                          unsigned ScratchRsrcReg,
650                                          unsigned ScratchOffsetReg,
651                                          int64_t InstOffset,
652                                          MachineMemOperand *MMO,
653                                          RegScavenger *RS) const {
654   MachineBasicBlock *MBB = MI->getParent();
655   MachineFunction *MF = MI->getParent()->getParent();
656   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
657   const SIInstrInfo *TII = ST.getInstrInfo();
658   const MachineFrameInfo &MFI = MF->getFrameInfo();
659 
660   const MCInstrDesc &Desc = TII->get(LoadStoreOp);
661   const DebugLoc &DL = MI->getDebugLoc();
662   bool IsStore = Desc.mayStore();
663 
664   bool Scavenged = false;
665   unsigned SOffset = ScratchOffsetReg;
666 
667   const unsigned EltSize = 4;
668   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
669   unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
670   unsigned Size = NumSubRegs * EltSize;
671   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
672   int64_t ScratchOffsetRegDelta = 0;
673 
674   unsigned Align = MFI.getObjectAlignment(Index);
675   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
676 
677   Register TmpReg =
678     hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
679                  : Register();
680 
681   assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
682 
683   if (!isUInt<12>(Offset + Size - EltSize)) {
684     SOffset = AMDGPU::NoRegister;
685 
686     // We currently only support spilling VGPRs to EltSize boundaries, meaning
687     // we can simplify the adjustment of Offset here to just scale with
688     // WavefrontSize.
689     Offset *= ST.getWavefrontSize();
690 
691     // We don't have access to the register scavenger if this function is called
692     // during  PEI::scavengeFrameVirtualRegs().
693     if (RS)
694       SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
695 
696     if (SOffset == AMDGPU::NoRegister) {
697       // There are no free SGPRs, and since we are in the process of spilling
698       // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
699       // on SI/CI and on VI it is true until we implement spilling using scalar
700       // stores), we have no way to free up an SGPR.  Our solution here is to
701       // add the offset directly to the ScratchOffset register, and then
702       // subtract the offset after the spill to return ScratchOffset to it's
703       // original value.
704       SOffset = ScratchOffsetReg;
705       ScratchOffsetRegDelta = Offset;
706     } else {
707       Scavenged = true;
708     }
709 
710     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
711       .addReg(ScratchOffsetReg)
712       .addImm(Offset);
713 
714     Offset = 0;
715   }
716 
717   for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
718     Register SubReg = NumSubRegs == 1
719                           ? Register(ValueReg)
720                           : getSubReg(ValueReg, getSubRegFromChannel(i));
721 
722     unsigned SOffsetRegState = 0;
723     unsigned SrcDstRegState = getDefRegState(!IsStore);
724     if (i + 1 == e) {
725       SOffsetRegState |= getKillRegState(Scavenged);
726       // The last implicit use carries the "Kill" flag.
727       SrcDstRegState |= getKillRegState(IsKill);
728     }
729 
730     auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
731 
732     if (!MIB.getInstr()) {
733       unsigned FinalReg = SubReg;
734       if (TmpReg != AMDGPU::NoRegister) {
735         if (IsStore)
736           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
737             .addReg(SubReg, getKillRegState(IsKill));
738         SubReg = TmpReg;
739       }
740 
741       MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
742       MachineMemOperand *NewMMO
743         = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
744                                    EltSize, MinAlign(Align, EltSize * i));
745 
746       MIB = BuildMI(*MBB, MI, DL, Desc)
747         .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
748         .addReg(ScratchRsrcReg)
749         .addReg(SOffset, SOffsetRegState)
750         .addImm(Offset)
751         .addImm(0) // glc
752         .addImm(0) // slc
753         .addImm(0) // tfe
754         .addImm(0) // dlc
755         .addMemOperand(NewMMO);
756 
757       if (!IsStore && TmpReg != AMDGPU::NoRegister)
758         MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
759                       FinalReg)
760           .addReg(TmpReg, RegState::Kill);
761     }
762 
763     if (NumSubRegs > 1)
764       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
765   }
766 
767   if (ScratchOffsetRegDelta != 0) {
768     // Subtract the offset we added to the ScratchOffset register.
769     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
770         .addReg(ScratchOffsetReg)
771         .addImm(ScratchOffsetRegDelta);
772   }
773 }
774 
775 static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
776                                                      bool Store) {
777   if (SuperRegSize % 16 == 0) {
778     return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
779                          AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
780   }
781 
782   if (SuperRegSize % 8 == 0) {
783     return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
784                         AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
785   }
786 
787   return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
788                       AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
789 }
790 
791 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
792                                int Index,
793                                RegScavenger *RS,
794                                bool OnlyToVGPR) const {
795   MachineBasicBlock *MBB = MI->getParent();
796   MachineFunction *MF = MBB->getParent();
797   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
798   DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
799 
800   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
801     = MFI->getSGPRToVGPRSpills(Index);
802   bool SpillToVGPR = !VGPRSpills.empty();
803   if (OnlyToVGPR && !SpillToVGPR)
804     return false;
805 
806   MachineRegisterInfo &MRI = MF->getRegInfo();
807   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
808   const SIInstrInfo *TII = ST.getInstrInfo();
809 
810   Register SuperReg = MI->getOperand(0).getReg();
811   bool IsKill = MI->getOperand(0).isKill();
812   const DebugLoc &DL = MI->getDebugLoc();
813 
814   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
815 
816   bool SpillToSMEM = spillSGPRToSMEM();
817   if (SpillToSMEM && OnlyToVGPR)
818     return false;
819 
820   Register FrameReg = getFrameRegister(*MF);
821 
822   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
823                          SuperReg != MFI->getFrameOffsetReg() &&
824                          SuperReg != MFI->getScratchWaveOffsetReg()));
825 
826   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
827 
828   unsigned OffsetReg = AMDGPU::M0;
829   unsigned M0CopyReg = AMDGPU::NoRegister;
830 
831   if (SpillToSMEM) {
832     if (RS->isRegUsed(AMDGPU::M0)) {
833       M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
834       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
835         .addReg(AMDGPU::M0);
836     }
837   }
838 
839   unsigned ScalarStoreOp;
840   unsigned EltSize = 4;
841   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
842   if (SpillToSMEM && isSGPRClass(RC)) {
843     // XXX - if private_element_size is larger than 4 it might be useful to be
844     // able to spill wider vmem spills.
845     std::tie(EltSize, ScalarStoreOp) =
846           getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
847   }
848 
849   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
850   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
851 
852   // SubReg carries the "Kill" flag when SubReg == SuperReg.
853   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
854   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
855     Register SubReg =
856         NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
857 
858     if (SpillToSMEM) {
859       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
860 
861       // The allocated memory size is really the wavefront size * the frame
862       // index size. The widest register class is 64 bytes, so a 4-byte scratch
863       // allocation is enough to spill this in a single stack object.
864       //
865       // FIXME: Frame size/offsets are computed earlier than this, so the extra
866       // space is still unnecessarily allocated.
867 
868       unsigned Align = FrameInfo.getObjectAlignment(Index);
869       MachinePointerInfo PtrInfo
870         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
871       MachineMemOperand *MMO
872         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
873                                    EltSize, MinAlign(Align, EltSize * i));
874 
875       // SMEM instructions only support a single offset, so increment the wave
876       // offset.
877 
878       int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
879       if (Offset != 0) {
880         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
881           .addReg(FrameReg)
882           .addImm(Offset);
883       } else {
884         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
885           .addReg(FrameReg);
886       }
887 
888       BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
889         .addReg(SubReg, getKillRegState(IsKill)) // sdata
890         .addReg(MFI->getScratchRSrcReg())        // sbase
891         .addReg(OffsetReg, RegState::Kill)       // soff
892         .addImm(0)                               // glc
893         .addImm(0)                               // dlc
894         .addMemOperand(MMO);
895 
896       continue;
897     }
898 
899     if (SpillToVGPR) {
900       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
901 
902       // During SGPR spilling to VGPR, determine if the VGPR is defined. The
903       // only circumstance in which we say it is undefined is when it is the
904       // first spill to this VGPR in the first basic block.
905       bool VGPRDefined = true;
906       if (MBB == &MF->front())
907         VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
908 
909       // Mark the "old value of vgpr" input undef only if this is the first sgpr
910       // spill to this specific vgpr in the first basic block.
911       BuildMI(*MBB, MI, DL,
912               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
913               Spill.VGPR)
914         .addReg(SubReg, getKillRegState(IsKill))
915         .addImm(Spill.Lane)
916         .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
917 
918       // FIXME: Since this spills to another register instead of an actual
919       // frame index, we should delete the frame index when all references to
920       // it are fixed.
921     } else {
922       // XXX - Can to VGPR spill fail for some subregisters but not others?
923       if (OnlyToVGPR)
924         return false;
925 
926       // Spill SGPR to a frame index.
927       // TODO: Should VI try to spill to VGPR and then spill to SMEM?
928       Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
929       // TODO: Should VI try to spill to VGPR and then spill to SMEM?
930 
931       MachineInstrBuilder Mov
932         = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
933         .addReg(SubReg, SubKillState);
934 
935 
936       // There could be undef components of a spilled super register.
937       // TODO: Can we detect this and skip the spill?
938       if (NumSubRegs > 1) {
939         // The last implicit use of the SuperReg carries the "Kill" flag.
940         unsigned SuperKillState = 0;
941         if (i + 1 == e)
942           SuperKillState |= getKillRegState(IsKill);
943         Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
944       }
945 
946       unsigned Align = FrameInfo.getObjectAlignment(Index);
947       MachinePointerInfo PtrInfo
948         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
949       MachineMemOperand *MMO
950         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
951                                    EltSize, MinAlign(Align, EltSize * i));
952       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
953         .addReg(TmpReg, RegState::Kill)       // src
954         .addFrameIndex(Index)                 // vaddr
955         .addReg(MFI->getScratchRSrcReg())     // srrsrc
956         .addReg(MFI->getStackPtrOffsetReg())  // soffset
957         .addImm(i * 4)                        // offset
958         .addMemOperand(MMO);
959     }
960   }
961 
962   if (M0CopyReg != AMDGPU::NoRegister) {
963     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
964       .addReg(M0CopyReg, RegState::Kill);
965   }
966 
967   MI->eraseFromParent();
968   MFI->addToSpilledSGPRs(NumSubRegs);
969   return true;
970 }
971 
972 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
973                                  int Index,
974                                  RegScavenger *RS,
975                                  bool OnlyToVGPR) const {
976   MachineFunction *MF = MI->getParent()->getParent();
977   MachineRegisterInfo &MRI = MF->getRegInfo();
978   MachineBasicBlock *MBB = MI->getParent();
979   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
980 
981   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
982     = MFI->getSGPRToVGPRSpills(Index);
983   bool SpillToVGPR = !VGPRSpills.empty();
984   if (OnlyToVGPR && !SpillToVGPR)
985     return false;
986 
987   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
988   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
989   const SIInstrInfo *TII = ST.getInstrInfo();
990   const DebugLoc &DL = MI->getDebugLoc();
991 
992   Register SuperReg = MI->getOperand(0).getReg();
993   bool SpillToSMEM = spillSGPRToSMEM();
994   if (SpillToSMEM && OnlyToVGPR)
995     return false;
996 
997   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
998 
999   unsigned OffsetReg = AMDGPU::M0;
1000   unsigned M0CopyReg = AMDGPU::NoRegister;
1001 
1002   if (SpillToSMEM) {
1003     if (RS->isRegUsed(AMDGPU::M0)) {
1004       M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1005       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
1006         .addReg(AMDGPU::M0);
1007     }
1008   }
1009 
1010   unsigned EltSize = 4;
1011   unsigned ScalarLoadOp;
1012 
1013   Register FrameReg = getFrameRegister(*MF);
1014 
1015   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1016   if (SpillToSMEM && isSGPRClass(RC)) {
1017     // XXX - if private_element_size is larger than 4 it might be useful to be
1018     // able to spill wider vmem spills.
1019     std::tie(EltSize, ScalarLoadOp) =
1020           getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
1021   }
1022 
1023   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1024   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1025 
1026   // SubReg carries the "Kill" flag when SubReg == SuperReg.
1027   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
1028 
1029   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1030     Register SubReg =
1031         NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
1032 
1033     if (SpillToSMEM) {
1034       // FIXME: Size may be > 4 but extra bytes wasted.
1035       unsigned Align = FrameInfo.getObjectAlignment(Index);
1036       MachinePointerInfo PtrInfo
1037         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
1038       MachineMemOperand *MMO
1039         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
1040                                    EltSize, MinAlign(Align, EltSize * i));
1041 
1042       // Add i * 4 offset
1043       int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
1044       if (Offset != 0) {
1045         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
1046           .addReg(FrameReg)
1047           .addImm(Offset);
1048       } else {
1049         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
1050           .addReg(FrameReg);
1051       }
1052 
1053       auto MIB =
1054         BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
1055         .addReg(MFI->getScratchRSrcReg())  // sbase
1056         .addReg(OffsetReg, RegState::Kill) // soff
1057         .addImm(0)                         // glc
1058         .addImm(0)                         // dlc
1059         .addMemOperand(MMO);
1060 
1061       if (NumSubRegs > 1 && i == 0)
1062         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1063 
1064       continue;
1065     }
1066 
1067     if (SpillToVGPR) {
1068       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1069       auto MIB =
1070         BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1071                 SubReg)
1072         .addReg(Spill.VGPR)
1073         .addImm(Spill.Lane);
1074 
1075       if (NumSubRegs > 1 && i == 0)
1076         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1077     } else {
1078       if (OnlyToVGPR)
1079         return false;
1080 
1081       // Restore SGPR from a stack slot.
1082       // FIXME: We should use S_LOAD_DWORD here for VI.
1083       Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1084       unsigned Align = FrameInfo.getObjectAlignment(Index);
1085 
1086       MachinePointerInfo PtrInfo
1087         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
1088 
1089       MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
1090         MachineMemOperand::MOLoad, EltSize,
1091         MinAlign(Align, EltSize * i));
1092 
1093       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
1094         .addFrameIndex(Index)                 // vaddr
1095         .addReg(MFI->getScratchRSrcReg())     // srsrc
1096         .addReg(MFI->getStackPtrOffsetReg())  // soffset
1097         .addImm(i * 4)                        // offset
1098         .addMemOperand(MMO);
1099 
1100       auto MIB =
1101         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
1102         .addReg(TmpReg, RegState::Kill);
1103 
1104       if (NumSubRegs > 1)
1105         MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
1106     }
1107   }
1108 
1109   if (M0CopyReg != AMDGPU::NoRegister) {
1110     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
1111       .addReg(M0CopyReg, RegState::Kill);
1112   }
1113 
1114   MI->eraseFromParent();
1115   return true;
1116 }
1117 
1118 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1119 /// a VGPR and the stack slot can be safely eliminated when all other users are
1120 /// handled.
1121 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
1122   MachineBasicBlock::iterator MI,
1123   int FI,
1124   RegScavenger *RS) const {
1125   switch (MI->getOpcode()) {
1126   case AMDGPU::SI_SPILL_S1024_SAVE:
1127   case AMDGPU::SI_SPILL_S512_SAVE:
1128   case AMDGPU::SI_SPILL_S256_SAVE:
1129   case AMDGPU::SI_SPILL_S160_SAVE:
1130   case AMDGPU::SI_SPILL_S128_SAVE:
1131   case AMDGPU::SI_SPILL_S96_SAVE:
1132   case AMDGPU::SI_SPILL_S64_SAVE:
1133   case AMDGPU::SI_SPILL_S32_SAVE:
1134     return spillSGPR(MI, FI, RS, true);
1135   case AMDGPU::SI_SPILL_S1024_RESTORE:
1136   case AMDGPU::SI_SPILL_S512_RESTORE:
1137   case AMDGPU::SI_SPILL_S256_RESTORE:
1138   case AMDGPU::SI_SPILL_S160_RESTORE:
1139   case AMDGPU::SI_SPILL_S128_RESTORE:
1140   case AMDGPU::SI_SPILL_S96_RESTORE:
1141   case AMDGPU::SI_SPILL_S64_RESTORE:
1142   case AMDGPU::SI_SPILL_S32_RESTORE:
1143     return restoreSGPR(MI, FI, RS, true);
1144   default:
1145     llvm_unreachable("not an SGPR spill instruction");
1146   }
1147 }
1148 
1149 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
1150                                         int SPAdj, unsigned FIOperandNum,
1151                                         RegScavenger *RS) const {
1152   MachineFunction *MF = MI->getParent()->getParent();
1153   MachineRegisterInfo &MRI = MF->getRegInfo();
1154   MachineBasicBlock *MBB = MI->getParent();
1155   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1156   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1157   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
1158   const SIInstrInfo *TII = ST.getInstrInfo();
1159   DebugLoc DL = MI->getDebugLoc();
1160 
1161   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1162 
1163   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1164   int Index = MI->getOperand(FIOperandNum).getIndex();
1165 
1166   Register FrameReg = getFrameRegister(*MF);
1167 
1168   switch (MI->getOpcode()) {
1169     // SGPR register spill
1170     case AMDGPU::SI_SPILL_S1024_SAVE:
1171     case AMDGPU::SI_SPILL_S512_SAVE:
1172     case AMDGPU::SI_SPILL_S256_SAVE:
1173     case AMDGPU::SI_SPILL_S160_SAVE:
1174     case AMDGPU::SI_SPILL_S128_SAVE:
1175     case AMDGPU::SI_SPILL_S96_SAVE:
1176     case AMDGPU::SI_SPILL_S64_SAVE:
1177     case AMDGPU::SI_SPILL_S32_SAVE: {
1178       spillSGPR(MI, Index, RS);
1179       break;
1180     }
1181 
1182     // SGPR register restore
1183     case AMDGPU::SI_SPILL_S1024_RESTORE:
1184     case AMDGPU::SI_SPILL_S512_RESTORE:
1185     case AMDGPU::SI_SPILL_S256_RESTORE:
1186     case AMDGPU::SI_SPILL_S160_RESTORE:
1187     case AMDGPU::SI_SPILL_S128_RESTORE:
1188     case AMDGPU::SI_SPILL_S96_RESTORE:
1189     case AMDGPU::SI_SPILL_S64_RESTORE:
1190     case AMDGPU::SI_SPILL_S32_RESTORE: {
1191       restoreSGPR(MI, Index, RS);
1192       break;
1193     }
1194 
1195     // VGPR register spill
1196     case AMDGPU::SI_SPILL_V1024_SAVE:
1197     case AMDGPU::SI_SPILL_V512_SAVE:
1198     case AMDGPU::SI_SPILL_V256_SAVE:
1199     case AMDGPU::SI_SPILL_V160_SAVE:
1200     case AMDGPU::SI_SPILL_V128_SAVE:
1201     case AMDGPU::SI_SPILL_V96_SAVE:
1202     case AMDGPU::SI_SPILL_V64_SAVE:
1203     case AMDGPU::SI_SPILL_V32_SAVE:
1204     case AMDGPU::SI_SPILL_A1024_SAVE:
1205     case AMDGPU::SI_SPILL_A512_SAVE:
1206     case AMDGPU::SI_SPILL_A128_SAVE:
1207     case AMDGPU::SI_SPILL_A64_SAVE:
1208     case AMDGPU::SI_SPILL_A32_SAVE: {
1209       const MachineOperand *VData = TII->getNamedOperand(*MI,
1210                                                          AMDGPU::OpName::vdata);
1211       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1212              MFI->getStackPtrOffsetReg());
1213 
1214       buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
1215             Index,
1216             VData->getReg(), VData->isKill(),
1217             TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1218             FrameReg,
1219             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1220             *MI->memoperands_begin(),
1221             RS);
1222       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1223       MI->eraseFromParent();
1224       break;
1225     }
1226     case AMDGPU::SI_SPILL_V32_RESTORE:
1227     case AMDGPU::SI_SPILL_V64_RESTORE:
1228     case AMDGPU::SI_SPILL_V96_RESTORE:
1229     case AMDGPU::SI_SPILL_V128_RESTORE:
1230     case AMDGPU::SI_SPILL_V160_RESTORE:
1231     case AMDGPU::SI_SPILL_V256_RESTORE:
1232     case AMDGPU::SI_SPILL_V512_RESTORE:
1233     case AMDGPU::SI_SPILL_V1024_RESTORE:
1234     case AMDGPU::SI_SPILL_A32_RESTORE:
1235     case AMDGPU::SI_SPILL_A64_RESTORE:
1236     case AMDGPU::SI_SPILL_A128_RESTORE:
1237     case AMDGPU::SI_SPILL_A512_RESTORE:
1238     case AMDGPU::SI_SPILL_A1024_RESTORE: {
1239       const MachineOperand *VData = TII->getNamedOperand(*MI,
1240                                                          AMDGPU::OpName::vdata);
1241       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1242              MFI->getStackPtrOffsetReg());
1243 
1244       buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
1245             Index,
1246             VData->getReg(), VData->isKill(),
1247             TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1248             FrameReg,
1249             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1250             *MI->memoperands_begin(),
1251             RS);
1252       MI->eraseFromParent();
1253       break;
1254     }
1255 
1256     default: {
1257       const DebugLoc &DL = MI->getDebugLoc();
1258       bool IsMUBUF = TII->isMUBUF(*MI);
1259 
1260       if (!IsMUBUF && !MFI->isEntryFunction()) {
1261         // Convert to an absolute stack address by finding the offset from the
1262         // scratch wave base and scaling by the wave size.
1263         //
1264         // In an entry function/kernel the offset is already the absolute
1265         // address relative to the frame register.
1266 
1267         Register DiffReg =
1268             MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1269 
1270         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1271         Register ResultReg = IsCopy ?
1272           MI->getOperand(0).getReg() :
1273           MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1274 
1275         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
1276           .addReg(FrameReg)
1277           .addReg(MFI->getScratchWaveOffsetReg());
1278 
1279         int64_t Offset = FrameInfo.getObjectOffset(Index);
1280         if (Offset == 0) {
1281           // XXX - This never happens because of emergency scavenging slot at 0?
1282           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1283             .addImm(Log2_32(ST.getWavefrontSize()))
1284             .addReg(DiffReg);
1285         } else {
1286           Register ScaledReg =
1287               MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1288 
1289           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
1290             .addImm(Log2_32(ST.getWavefrontSize()))
1291             .addReg(DiffReg, RegState::Kill);
1292 
1293           // TODO: Fold if use instruction is another add of a constant.
1294           if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1295             TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
1296               .addImm(Offset)
1297               .addReg(ScaledReg, RegState::Kill)
1298               .addImm(0); // clamp bit
1299           } else {
1300             Register ConstOffsetReg =
1301                 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1302 
1303             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
1304               .addImm(Offset);
1305             TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
1306               .addReg(ConstOffsetReg, RegState::Kill)
1307               .addReg(ScaledReg, RegState::Kill)
1308               .addImm(0); // clamp bit
1309           }
1310         }
1311 
1312         // Don't introduce an extra copy if we're just materializing in a mov.
1313         if (IsCopy)
1314           MI->eraseFromParent();
1315         else
1316           FIOp.ChangeToRegister(ResultReg, false, false, true);
1317         return;
1318       }
1319 
1320       if (IsMUBUF) {
1321         // Disable offen so we don't need a 0 vgpr base.
1322         assert(static_cast<int>(FIOperandNum) ==
1323                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1324                                           AMDGPU::OpName::vaddr));
1325 
1326         assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1327                MFI->getStackPtrOffsetReg());
1328 
1329         TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
1330 
1331         int64_t Offset = FrameInfo.getObjectOffset(Index);
1332         int64_t OldImm
1333           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1334         int64_t NewOffset = OldImm + Offset;
1335 
1336         if (isUInt<12>(NewOffset) &&
1337             buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
1338           MI->eraseFromParent();
1339           return;
1340         }
1341       }
1342 
1343       // If the offset is simply too big, don't convert to a scratch wave offset
1344       // relative index.
1345 
1346       int64_t Offset = FrameInfo.getObjectOffset(Index);
1347       FIOp.ChangeToImmediate(Offset);
1348       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1349         Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1350         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1351           .addImm(Offset);
1352         FIOp.ChangeToRegister(TmpReg, false, false, true);
1353       }
1354     }
1355   }
1356 }
1357 
1358 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
1359   return AMDGPUInstPrinter::getRegisterName(Reg);
1360 }
1361 
1362 // FIXME: This is very slow. It might be worth creating a map from physreg to
1363 // register class.
1364 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
1365   assert(!Register::isVirtualRegister(Reg));
1366 
1367   static const TargetRegisterClass *const BaseClasses[] = {
1368     &AMDGPU::VGPR_32RegClass,
1369     &AMDGPU::SReg_32RegClass,
1370     &AMDGPU::AGPR_32RegClass,
1371     &AMDGPU::VReg_64RegClass,
1372     &AMDGPU::SReg_64RegClass,
1373     &AMDGPU::AReg_64RegClass,
1374     &AMDGPU::VReg_96RegClass,
1375     &AMDGPU::SReg_96RegClass,
1376     &AMDGPU::VReg_128RegClass,
1377     &AMDGPU::SReg_128RegClass,
1378     &AMDGPU::AReg_128RegClass,
1379     &AMDGPU::VReg_160RegClass,
1380     &AMDGPU::SReg_160RegClass,
1381     &AMDGPU::VReg_256RegClass,
1382     &AMDGPU::SReg_256RegClass,
1383     &AMDGPU::VReg_512RegClass,
1384     &AMDGPU::SReg_512RegClass,
1385     &AMDGPU::AReg_512RegClass,
1386     &AMDGPU::SReg_1024RegClass,
1387     &AMDGPU::VReg_1024RegClass,
1388     &AMDGPU::AReg_1024RegClass,
1389     &AMDGPU::SCC_CLASSRegClass,
1390     &AMDGPU::Pseudo_SReg_32RegClass,
1391     &AMDGPU::Pseudo_SReg_128RegClass,
1392   };
1393 
1394   for (const TargetRegisterClass *BaseClass : BaseClasses) {
1395     if (BaseClass->contains(Reg)) {
1396       return BaseClass;
1397     }
1398   }
1399   return nullptr;
1400 }
1401 
1402 // TODO: It might be helpful to have some target specific flags in
1403 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1404 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1405   unsigned Size = getRegSizeInBits(*RC);
1406   if (Size < 32)
1407     return false;
1408   switch (Size) {
1409   case 32:
1410     return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
1411   case 64:
1412     return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
1413   case 96:
1414     return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
1415   case 128:
1416     return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
1417   case 160:
1418     return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
1419   case 256:
1420     return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
1421   case 512:
1422     return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
1423   case 1024:
1424     return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
1425   default:
1426     llvm_unreachable("Invalid register class size");
1427   }
1428 }
1429 
1430 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
1431   unsigned Size = getRegSizeInBits(*RC);
1432   if (Size < 32)
1433     return false;
1434   switch (Size) {
1435   case 32:
1436     return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
1437   case 64:
1438     return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
1439   case 96:
1440     return false;
1441   case 128:
1442     return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
1443   case 160:
1444   case 256:
1445     return false;
1446   case 512:
1447     return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
1448   case 1024:
1449     return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
1450   default:
1451     llvm_unreachable("Invalid register class size");
1452   }
1453 }
1454 
1455 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
1456                                          const TargetRegisterClass *SRC) const {
1457   switch (getRegSizeInBits(*SRC)) {
1458   case 32:
1459     return &AMDGPU::VGPR_32RegClass;
1460   case 64:
1461     return &AMDGPU::VReg_64RegClass;
1462   case 96:
1463     return &AMDGPU::VReg_96RegClass;
1464   case 128:
1465     return &AMDGPU::VReg_128RegClass;
1466   case 160:
1467     return &AMDGPU::VReg_160RegClass;
1468   case 256:
1469     return &AMDGPU::VReg_256RegClass;
1470   case 512:
1471     return &AMDGPU::VReg_512RegClass;
1472   case 1024:
1473     return &AMDGPU::VReg_1024RegClass;
1474   default:
1475     llvm_unreachable("Invalid register class size");
1476   }
1477 }
1478 
1479 const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
1480                                          const TargetRegisterClass *SRC) const {
1481   switch (getRegSizeInBits(*SRC)) {
1482   case 32:
1483     return &AMDGPU::AGPR_32RegClass;
1484   case 64:
1485     return &AMDGPU::AReg_64RegClass;
1486   case 128:
1487     return &AMDGPU::AReg_128RegClass;
1488   case 512:
1489     return &AMDGPU::AReg_512RegClass;
1490   case 1024:
1491     return &AMDGPU::AReg_1024RegClass;
1492   default:
1493     llvm_unreachable("Invalid register class size");
1494   }
1495 }
1496 
1497 const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
1498                                          const TargetRegisterClass *VRC) const {
1499   switch (getRegSizeInBits(*VRC)) {
1500   case 32:
1501     return &AMDGPU::SGPR_32RegClass;
1502   case 64:
1503     return &AMDGPU::SReg_64RegClass;
1504   case 96:
1505     return &AMDGPU::SReg_96RegClass;
1506   case 128:
1507     return &AMDGPU::SReg_128RegClass;
1508   case 160:
1509     return &AMDGPU::SReg_160RegClass;
1510   case 256:
1511     return &AMDGPU::SReg_256RegClass;
1512   case 512:
1513     return &AMDGPU::SReg_512RegClass;
1514   case 1024:
1515     return &AMDGPU::SReg_1024RegClass;
1516   default:
1517     llvm_unreachable("Invalid register class size");
1518   }
1519 }
1520 
1521 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
1522                          const TargetRegisterClass *RC, unsigned SubIdx) const {
1523   if (SubIdx == AMDGPU::NoSubRegister)
1524     return RC;
1525 
1526   // We can assume that each lane corresponds to one 32-bit register.
1527   unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
1528   if (isSGPRClass(RC)) {
1529     switch (Count) {
1530     case 1:
1531       return &AMDGPU::SGPR_32RegClass;
1532     case 2:
1533       return &AMDGPU::SReg_64RegClass;
1534     case 3:
1535       return &AMDGPU::SReg_96RegClass;
1536     case 4:
1537       return &AMDGPU::SReg_128RegClass;
1538     case 5:
1539       return &AMDGPU::SReg_160RegClass;
1540     case 8:
1541       return &AMDGPU::SReg_256RegClass;
1542     case 16:
1543       return &AMDGPU::SReg_512RegClass;
1544     case 32: /* fall-through */
1545     default:
1546       llvm_unreachable("Invalid sub-register class size");
1547     }
1548   } else if (hasAGPRs(RC)) {
1549     switch (Count) {
1550     case 1:
1551       return &AMDGPU::AGPR_32RegClass;
1552     case 2:
1553       return &AMDGPU::AReg_64RegClass;
1554     case 4:
1555       return &AMDGPU::AReg_128RegClass;
1556     case 16:
1557       return &AMDGPU::AReg_512RegClass;
1558     case 32: /* fall-through */
1559     default:
1560       llvm_unreachable("Invalid sub-register class size");
1561     }
1562   } else {
1563     switch (Count) {
1564     case 1:
1565       return &AMDGPU::VGPR_32RegClass;
1566     case 2:
1567       return &AMDGPU::VReg_64RegClass;
1568     case 3:
1569       return &AMDGPU::VReg_96RegClass;
1570     case 4:
1571       return &AMDGPU::VReg_128RegClass;
1572     case 5:
1573       return &AMDGPU::VReg_160RegClass;
1574     case 8:
1575       return &AMDGPU::VReg_256RegClass;
1576     case 16:
1577       return &AMDGPU::VReg_512RegClass;
1578     case 32: /* fall-through */
1579     default:
1580       llvm_unreachable("Invalid sub-register class size");
1581     }
1582   }
1583 }
1584 
1585 bool SIRegisterInfo::shouldRewriteCopySrc(
1586   const TargetRegisterClass *DefRC,
1587   unsigned DefSubReg,
1588   const TargetRegisterClass *SrcRC,
1589   unsigned SrcSubReg) const {
1590   // We want to prefer the smallest register class possible, so we don't want to
1591   // stop and rewrite on anything that looks like a subregister
1592   // extract. Operations mostly don't care about the super register class, so we
1593   // only want to stop on the most basic of copies between the same register
1594   // class.
1595   //
1596   // e.g. if we have something like
1597   // %0 = ...
1598   // %1 = ...
1599   // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
1600   // %3 = COPY %2, sub0
1601   //
1602   // We want to look through the COPY to find:
1603   //  => %3 = COPY %0
1604 
1605   // Plain copy.
1606   return getCommonSubClass(DefRC, SrcRC) != nullptr;
1607 }
1608 
1609 /// Returns a register that is not used at any point in the function.
1610 ///        If all registers are used, then this function will return
1611 //         AMDGPU::NoRegister.
1612 unsigned
1613 SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
1614                                    const TargetRegisterClass *RC,
1615                                    const MachineFunction &MF) const {
1616 
1617   for (unsigned Reg : *RC)
1618     if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1619       return Reg;
1620   return AMDGPU::NoRegister;
1621 }
1622 
1623 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
1624                                                    unsigned EltSize) const {
1625   if (EltSize == 4) {
1626     static const int16_t Sub0_31[] = {
1627       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1628       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1629       AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1630       AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1631       AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
1632       AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
1633       AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
1634       AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
1635     };
1636 
1637     static const int16_t Sub0_15[] = {
1638       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1639       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1640       AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1641       AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1642     };
1643 
1644     static const int16_t Sub0_7[] = {
1645       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1646       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1647     };
1648 
1649     static const int16_t Sub0_4[] = {
1650       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
1651     };
1652 
1653     static const int16_t Sub0_3[] = {
1654       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1655     };
1656 
1657     static const int16_t Sub0_2[] = {
1658       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
1659     };
1660 
1661     static const int16_t Sub0_1[] = {
1662       AMDGPU::sub0, AMDGPU::sub1,
1663     };
1664 
1665     switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1666     case 32:
1667       return {};
1668     case 64:
1669       return makeArrayRef(Sub0_1);
1670     case 96:
1671       return makeArrayRef(Sub0_2);
1672     case 128:
1673       return makeArrayRef(Sub0_3);
1674     case 160:
1675       return makeArrayRef(Sub0_4);
1676     case 256:
1677       return makeArrayRef(Sub0_7);
1678     case 512:
1679       return makeArrayRef(Sub0_15);
1680     case 1024:
1681       return makeArrayRef(Sub0_31);
1682     default:
1683       llvm_unreachable("unhandled register size");
1684     }
1685   }
1686 
1687   if (EltSize == 8) {
1688     static const int16_t Sub0_31_64[] = {
1689       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1690       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1691       AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1692       AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1693       AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
1694       AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
1695       AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
1696       AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
1697     };
1698 
1699     static const int16_t Sub0_15_64[] = {
1700       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1701       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1702       AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1703       AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
1704     };
1705 
1706     static const int16_t Sub0_7_64[] = {
1707       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1708       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
1709     };
1710 
1711 
1712     static const int16_t Sub0_3_64[] = {
1713       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
1714     };
1715 
1716     switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1717     case 64:
1718       return {};
1719     case 128:
1720       return makeArrayRef(Sub0_3_64);
1721     case 256:
1722       return makeArrayRef(Sub0_7_64);
1723     case 512:
1724       return makeArrayRef(Sub0_15_64);
1725     case 1024:
1726       return makeArrayRef(Sub0_31_64);
1727     default:
1728       llvm_unreachable("unhandled register size");
1729     }
1730   }
1731 
1732   if (EltSize == 16) {
1733 
1734     static const int16_t Sub0_31_128[] = {
1735       AMDGPU::sub0_sub1_sub2_sub3,
1736       AMDGPU::sub4_sub5_sub6_sub7,
1737       AMDGPU::sub8_sub9_sub10_sub11,
1738       AMDGPU::sub12_sub13_sub14_sub15,
1739       AMDGPU::sub16_sub17_sub18_sub19,
1740       AMDGPU::sub20_sub21_sub22_sub23,
1741       AMDGPU::sub24_sub25_sub26_sub27,
1742       AMDGPU::sub28_sub29_sub30_sub31
1743     };
1744 
1745     static const int16_t Sub0_15_128[] = {
1746       AMDGPU::sub0_sub1_sub2_sub3,
1747       AMDGPU::sub4_sub5_sub6_sub7,
1748       AMDGPU::sub8_sub9_sub10_sub11,
1749       AMDGPU::sub12_sub13_sub14_sub15
1750     };
1751 
1752     static const int16_t Sub0_7_128[] = {
1753       AMDGPU::sub0_sub1_sub2_sub3,
1754       AMDGPU::sub4_sub5_sub6_sub7
1755     };
1756 
1757     switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1758     case 128:
1759       return {};
1760     case 256:
1761       return makeArrayRef(Sub0_7_128);
1762     case 512:
1763       return makeArrayRef(Sub0_15_128);
1764     case 1024:
1765       return makeArrayRef(Sub0_31_128);
1766     default:
1767       llvm_unreachable("unhandled register size");
1768     }
1769   }
1770 
1771   assert(EltSize == 32 && "unhandled elt size");
1772 
1773   static const int16_t Sub0_31_256[] = {
1774     AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
1775     AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
1776     AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
1777     AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
1778   };
1779 
1780   static const int16_t Sub0_15_256[] = {
1781     AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
1782     AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
1783   };
1784 
1785   switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1786   case 256:
1787     return {};
1788   case 512:
1789     return makeArrayRef(Sub0_15_256);
1790   case 1024:
1791     return makeArrayRef(Sub0_31_256);
1792   default:
1793     llvm_unreachable("unhandled register size");
1794   }
1795 }
1796 
1797 const TargetRegisterClass*
1798 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
1799                                   unsigned Reg) const {
1800   if (Register::isVirtualRegister(Reg))
1801     return  MRI.getRegClass(Reg);
1802 
1803   return getPhysRegClass(Reg);
1804 }
1805 
1806 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
1807                             unsigned Reg) const {
1808   const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
1809   assert(RC && "Register class for the reg not found");
1810   return hasVGPRs(RC);
1811 }
1812 
1813 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
1814                             unsigned Reg) const {
1815   const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
1816   assert(RC && "Register class for the reg not found");
1817   return hasAGPRs(RC);
1818 }
1819 
1820 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
1821                                     const TargetRegisterClass *SrcRC,
1822                                     unsigned SubReg,
1823                                     const TargetRegisterClass *DstRC,
1824                                     unsigned DstSubReg,
1825                                     const TargetRegisterClass *NewRC,
1826                                     LiveIntervals &LIS) const {
1827   unsigned SrcSize = getRegSizeInBits(*SrcRC);
1828   unsigned DstSize = getRegSizeInBits(*DstRC);
1829   unsigned NewSize = getRegSizeInBits(*NewRC);
1830 
1831   // Do not increase size of registers beyond dword, we would need to allocate
1832   // adjacent registers and constraint regalloc more than needed.
1833 
1834   // Always allow dword coalescing.
1835   if (SrcSize <= 32 || DstSize <= 32)
1836     return true;
1837 
1838   return NewSize <= DstSize || NewSize <= SrcSize;
1839 }
1840 
1841 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
1842                                              MachineFunction &MF) const {
1843 
1844   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1845   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1846 
1847   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
1848                                                        MF.getFunction());
1849   switch (RC->getID()) {
1850   default:
1851     return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
1852   case AMDGPU::VGPR_32RegClassID:
1853     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
1854   case AMDGPU::SGPR_32RegClassID:
1855     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
1856   }
1857 }
1858 
1859 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
1860                                                 unsigned Idx) const {
1861   if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
1862     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
1863                                const_cast<MachineFunction &>(MF));
1864 
1865   if (Idx == getSGPRPressureSet())
1866     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
1867                                const_cast<MachineFunction &>(MF));
1868 
1869   return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
1870 }
1871 
1872 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
1873   static const int Empty[] = { -1 };
1874 
1875   if (hasRegUnit(AMDGPU::M0, RegUnit))
1876     return Empty;
1877   return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
1878 }
1879 
1880 unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
1881   // Not a callee saved register.
1882   return AMDGPU::SGPR30_SGPR31;
1883 }
1884 
1885 const TargetRegisterClass *
1886 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
1887                                          const RegisterBank &RB,
1888                                          const MachineRegisterInfo &MRI) const {
1889   switch (Size) {
1890   case 1: {
1891     switch (RB.getID()) {
1892     case AMDGPU::VGPRRegBankID:
1893       return &AMDGPU::VGPR_32RegClass;
1894     case AMDGPU::VCCRegBankID:
1895       return isWave32 ?
1896         &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
1897     case AMDGPU::SGPRRegBankID:
1898       return &AMDGPU::SReg_32_XM0RegClass;
1899     case AMDGPU::SCCRegBankID:
1900       // This needs to return an allocatable class, so don't bother returning
1901       // the dummy SCC class.
1902       return &AMDGPU::SReg_32_XM0RegClass;
1903     default:
1904       llvm_unreachable("unknown register bank");
1905     }
1906   }
1907   case 32:
1908     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
1909                                                  &AMDGPU::SReg_32_XM0RegClass;
1910   case 64:
1911     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
1912                                                  &AMDGPU::SReg_64_XEXECRegClass;
1913   case 96:
1914     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
1915                                                  &AMDGPU::SReg_96RegClass;
1916   case 128:
1917     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
1918                                                  &AMDGPU::SReg_128RegClass;
1919   case 160:
1920     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
1921                                                  &AMDGPU::SReg_160RegClass;
1922   case 256:
1923     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
1924                                                  &AMDGPU::SReg_256RegClass;
1925   case 512:
1926     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
1927                                                  &AMDGPU::SReg_512RegClass;
1928   default:
1929     if (Size < 32)
1930       return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
1931                                                    &AMDGPU::SReg_32_XM0RegClass;
1932     return nullptr;
1933   }
1934 }
1935 
1936 const TargetRegisterClass *
1937 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
1938                                          const MachineRegisterInfo &MRI) const {
1939   if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
1940     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
1941   return nullptr;
1942 }
1943 
1944 unsigned SIRegisterInfo::getVCC() const {
1945   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
1946 }
1947 
1948 const TargetRegisterClass *
1949 SIRegisterInfo::getRegClass(unsigned RCID) const {
1950   switch ((int)RCID) {
1951   case AMDGPU::SReg_1RegClassID:
1952     return getBoolRC();
1953   case AMDGPU::SReg_1_XEXECRegClassID:
1954     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
1955       : &AMDGPU::SReg_64_XEXECRegClass;
1956   case -1:
1957     return nullptr;
1958   default:
1959     return AMDGPURegisterInfo::getRegClass(RCID);
1960   }
1961 }
1962 
1963 // Find reaching register definition
1964 MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
1965                                               MachineInstr &Use,
1966                                               MachineRegisterInfo &MRI,
1967                                               LiveIntervals *LIS) const {
1968   auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
1969   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
1970   SlotIndex DefIdx;
1971 
1972   if (Register::isVirtualRegister(Reg)) {
1973     if (!LIS->hasInterval(Reg))
1974       return nullptr;
1975     LiveInterval &LI = LIS->getInterval(Reg);
1976     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
1977                                   : MRI.getMaxLaneMaskForVReg(Reg);
1978     VNInfo *V = nullptr;
1979     if (LI.hasSubRanges()) {
1980       for (auto &S : LI.subranges()) {
1981         if ((S.LaneMask & SubLanes) == SubLanes) {
1982           V = S.getVNInfoAt(UseIdx);
1983           break;
1984         }
1985       }
1986     } else {
1987       V = LI.getVNInfoAt(UseIdx);
1988     }
1989     if (!V)
1990       return nullptr;
1991     DefIdx = V->def;
1992   } else {
1993     // Find last def.
1994     for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
1995       LiveRange &LR = LIS->getRegUnit(*Units);
1996       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
1997         if (!DefIdx.isValid() ||
1998             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
1999                           LIS->getInstructionFromIndex(V->def)))
2000           DefIdx = V->def;
2001       } else {
2002         return nullptr;
2003       }
2004     }
2005   }
2006 
2007   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
2008 
2009   if (!Def || !MDT.dominates(Def, &Use))
2010     return nullptr;
2011 
2012   assert(Def->modifiesRegister(Reg, this));
2013 
2014   return Def;
2015 }
2016