1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIRegisterInfo.h"
15 #include "AMDGPURegisterBankInfo.h"
16 #include "AMDGPUSubtarget.h"
17 #include "SIInstrInfo.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "MCTargetDesc/AMDGPUInstPrinter.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/MachineDominators.h"
23 #include "llvm/CodeGen/MachineFrameInfo.h"
24 #include "llvm/CodeGen/MachineInstrBuilder.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 #include "llvm/CodeGen/SlotIndexes.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/LLVMContext.h"
29 
30 using namespace llvm;
31 
/// Returns true if \p PSetID appears in \p PSets, a list of pressure set IDs
/// that is terminated by -1.
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  const int Wanted = (int)PSetID;
  for (const int *Cur = PSets; *Cur != -1; ++Cur)
    if (*Cur == Wanted)
      return true;
  return false;
}
39 
40 void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
41                                          BitVector &PressureSets) const {
42   for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
43     const int *PSets = getRegUnitPressureSets(*U);
44     if (hasPressureSet(PSets, PSetID)) {
45       PressureSets.set(PSetID);
46       break;
47     }
48   }
49 }
50 
51 static cl::opt<bool> EnableSpillSGPRToSMEM(
52   "amdgpu-spill-sgpr-to-smem",
53   cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
54   cl::init(false));
55 
56 static cl::opt<bool> EnableSpillSGPRToVGPR(
57   "amdgpu-spill-sgpr-to-vgpr",
58   cl::desc("Enable spilling VGPRs to SGPRs"),
59   cl::ReallyHidden,
60   cl::init(true));
61 
62 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
63   AMDGPURegisterInfo(),
64   SGPRPressureSets(getNumRegPressureSets()),
65   VGPRPressureSets(getNumRegPressureSets()),
66   AGPRPressureSets(getNumRegPressureSets()),
67   SpillSGPRToVGPR(false),
68   SpillSGPRToSMEM(false),
69   isWave32(ST.isWave32()) {
70   if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
71     SpillSGPRToSMEM = true;
72   else if (EnableSpillSGPRToVGPR)
73     SpillSGPRToVGPR = true;
74 
75   unsigned NumRegPressureSets = getNumRegPressureSets();
76 
77   SGPRSetID = NumRegPressureSets;
78   VGPRSetID = NumRegPressureSets;
79   AGPRSetID = NumRegPressureSets;
80 
81   for (unsigned i = 0; i < NumRegPressureSets; ++i) {
82     classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
83     classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
84     classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
85   }
86 
87   // Determine the number of reg units for each pressure set.
88   std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
89   for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
90     const int *PSets = getRegUnitPressureSets(i);
91     for (unsigned j = 0; PSets[j] != -1; ++j) {
92       ++PressureSetRegUnits[PSets[j]];
93     }
94   }
95 
96   unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
97   for (unsigned i = 0; i < NumRegPressureSets; ++i) {
98     if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
99       VGPRSetID = i;
100       VGPRMax = PressureSetRegUnits[i];
101       continue;
102     }
103     if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
104       SGPRSetID = i;
105       SGPRMax = PressureSetRegUnits[i];
106     }
107     if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
108       AGPRSetID = i;
109       AGPRMax = PressureSetRegUnits[i];
110       continue;
111     }
112   }
113 
114   assert(SGPRSetID < NumRegPressureSets &&
115          VGPRSetID < NumRegPressureSets &&
116          AGPRSetID < NumRegPressureSets);
117 }
118 
119 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
120   const MachineFunction &MF) const {
121 
122   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
123   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
124   unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
125   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
126 }
127 
/// Picks the SGPR index for the private segment wave byte offset given the
/// number of usable SGPRs \p RegCount. The slot is chosen relative to the
/// 4-aligned group used for the private segment buffer.
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
  const bool CountIsMultipleOf4 = (RegCount & 3) == 0;

  // When the count is not a multiple of 4, the segment buffer cannot occupy
  // the topmost registers because of its alignment, which leaves a hole at
  // the very top: use the last register.
  if (!CountIsMultipleOf4)
    return RegCount - 1;

  // Otherwise the segment buffer takes the top four registers and the wave
  // offset goes immediately below it.
  return RegCount - 5;
}
144 
145 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
146   const MachineFunction &MF) const {
147   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
148   unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
149   return AMDGPU::SGPR_32RegClass.getRegister(Reg);
150 }
151 
152 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
153   BitVector Reserved(getNumRegs());
154 
155   // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
156   // this seems likely to result in bugs, so I'm marking them as reserved.
157   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
158   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
159 
160   // M0 has to be reserved so that llvm accepts it as a live-in into a block.
161   reserveRegisterTuples(Reserved, AMDGPU::M0);
162 
163   // Reserve src_vccz, src_execz, src_scc.
164   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
165   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
166   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
167 
168   // Reserve the memory aperture registers.
169   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
170   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
171   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
172   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
173 
174   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
175   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
176 
177   // Reserve xnack_mask registers - support is not implemented in Codegen.
178   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
179 
180   // Reserve lds_direct register - support is not implemented in Codegen.
181   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
182 
183   // Reserve Trap Handler registers - support is not implemented in Codegen.
184   reserveRegisterTuples(Reserved, AMDGPU::TBA);
185   reserveRegisterTuples(Reserved, AMDGPU::TMA);
186   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
187   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
188   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
189   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
190   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
191   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
192   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
193   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
194 
195   // Reserve null register - it shall never be allocated
196   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
197 
198   // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
199   // will result in bugs.
200   if (isWave32) {
201     Reserved.set(AMDGPU::VCC);
202     Reserved.set(AMDGPU::VCC_HI);
203   }
204 
205   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
206 
207   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
208   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
209   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
210     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
211     reserveRegisterTuples(Reserved, Reg);
212   }
213 
214   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
215   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
216   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
217     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
218     reserveRegisterTuples(Reserved, Reg);
219     Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
220     reserveRegisterTuples(Reserved, Reg);
221   }
222 
223   // Reserve all the rest AGPRs if there are no instructions to use it.
224   if (!ST.hasMAIInsts()) {
225     for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
226       unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
227       reserveRegisterTuples(Reserved, Reg);
228     }
229   }
230 
231   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
232 
233   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
234   if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
235     // Reserve 1 SGPR for scratch wave offset in case we need to spill.
236     reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
237   }
238 
239   unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
240   if (ScratchRSrcReg != AMDGPU::NoRegister) {
241     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
242     // to spill.
243     // TODO: May need to reserve a VGPR if doing LDS spilling.
244     reserveRegisterTuples(Reserved, ScratchRSrcReg);
245     assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
246   }
247 
248   // We have to assume the SP is needed in case there are calls in the function,
249   // which is detected after the function is lowered. If we aren't really going
250   // to need SP, don't bother reserving it.
251   unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
252 
253   if (StackPtrReg != AMDGPU::NoRegister) {
254     reserveRegisterTuples(Reserved, StackPtrReg);
255     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
256   }
257 
258   unsigned FrameReg = MFI->getFrameOffsetReg();
259   if (FrameReg != AMDGPU::NoRegister) {
260     reserveRegisterTuples(Reserved, FrameReg);
261     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
262   }
263 
264   for (unsigned Reg : MFI->WWMReservedRegs) {
265     reserveRegisterTuples(Reserved, Reg);
266   }
267 
268   // FIXME: Stop using reserved registers for this.
269   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
270     reserveRegisterTuples(Reserved, Reg);
271 
272   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
273     reserveRegisterTuples(Reserved, Reg);
274 
275   return Reserved;
276 }
277 
278 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
279   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
280   // On entry, the base address is 0, so it can't possibly need any more
281   // alignment.
282 
283   // FIXME: Should be able to specify the entry frame alignment per calling
284   // convention instead.
285   if (Info->isEntryFunction())
286     return false;
287 
288   return TargetRegisterInfo::canRealignStack(MF);
289 }
290 
291 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
292   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
293   if (Info->isEntryFunction()) {
294     const MachineFrameInfo &MFI = Fn.getFrameInfo();
295     return MFI.hasStackObjects() || MFI.hasCalls();
296   }
297 
298   // May need scavenger for dealing with callee saved registers.
299   return true;
300 }
301 
302 bool SIRegisterInfo::requiresFrameIndexScavenging(
303   const MachineFunction &MF) const {
304   const MachineFrameInfo &MFI = MF.getFrameInfo();
305   if (MFI.hasStackObjects())
306     return true;
307 
308   // May need to deal with callee saved registers.
309   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
310   return !Info->isEntryFunction();
311 }
312 
313 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
314   const MachineFunction &MF) const {
315   const MachineFrameInfo &MFI = MF.getFrameInfo();
316   if (!MFI.hasStackObjects())
317     return false;
318 
319   // The scavenger is used for large frames which may require finding a free
320   // register for large offsets.
321   if (!isUInt<12>(MFI.getStackSize()))
322     return true;
323 
324   // If using scalar stores, for spills, m0 is needed for the scalar store
325   // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
326   // register for it during frame index elimination, so the scavenger is
327   // directly needed.
328   return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
329          MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
330 }
331 
332 bool SIRegisterInfo::requiresVirtualBaseRegisters(
333   const MachineFunction &) const {
334   // There are no special dedicated stack or frame pointers.
335   return true;
336 }
337 
338 bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
339   // This helps catch bugs as verifier errors.
340   return true;
341 }
342 
343 int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
344   assert(SIInstrInfo::isMUBUF(*MI));
345 
346   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
347                                           AMDGPU::OpName::offset);
348   return MI->getOperand(OffIdx).getImm();
349 }
350 
351 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
352                                                  int Idx) const {
353   if (!SIInstrInfo::isMUBUF(*MI))
354     return 0;
355 
356   assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
357                                            AMDGPU::OpName::vaddr) &&
358          "Should never see frame index on non-address operand");
359 
360   return getMUBUFInstrOffset(MI);
361 }
362 
363 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
364   if (!MI->mayLoadOrStore())
365     return false;
366 
367   int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
368 
369   return !isUInt<12>(FullOffset);
370 }
371 
372 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
373                                                   unsigned BaseReg,
374                                                   int FrameIdx,
375                                                   int64_t Offset) const {
376   MachineBasicBlock::iterator Ins = MBB->begin();
377   DebugLoc DL; // Defaults to "unknown"
378 
379   if (Ins != MBB->end())
380     DL = Ins->getDebugLoc();
381 
382   MachineFunction *MF = MBB->getParent();
383   const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
384   const SIInstrInfo *TII = Subtarget.getInstrInfo();
385 
386   if (Offset == 0) {
387     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
388       .addFrameIndex(FrameIdx);
389     return;
390   }
391 
392   MachineRegisterInfo &MRI = MF->getRegInfo();
393   unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
394 
395   unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
396 
397   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
398     .addImm(Offset);
399   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
400     .addFrameIndex(FrameIdx);
401 
402   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
403     .addReg(OffsetReg, RegState::Kill)
404     .addReg(FIReg)
405     .addImm(0); // clamp bit
406 }
407 
408 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
409                                        int64_t Offset) const {
410 
411   MachineBasicBlock *MBB = MI.getParent();
412   MachineFunction *MF = MBB->getParent();
413   const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
414   const SIInstrInfo *TII = Subtarget.getInstrInfo();
415 
416 #ifndef NDEBUG
417   // FIXME: Is it possible to be storing a frame index to itself?
418   bool SeenFI = false;
419   for (const MachineOperand &MO: MI.operands()) {
420     if (MO.isFI()) {
421       if (SeenFI)
422         llvm_unreachable("should not see multiple frame indices");
423 
424       SeenFI = true;
425     }
426   }
427 #endif
428 
429   MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
430   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
431   assert(TII->isMUBUF(MI));
432   assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
433          MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
434          "should only be seeing frame offset relative FrameIndex");
435 
436 
437   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
438   int64_t NewOffset = OffsetOp->getImm() + Offset;
439   assert(isUInt<12>(NewOffset) && "offset should be legal");
440 
441   FIOp->ChangeToRegister(BaseReg, false);
442   OffsetOp->setImm(NewOffset);
443 }
444 
445 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
446                                         unsigned BaseReg,
447                                         int64_t Offset) const {
448   if (!SIInstrInfo::isMUBUF(*MI))
449     return false;
450 
451   int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
452 
453   return isUInt<12>(NewOffset);
454 }
455 
456 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
457   const MachineFunction &MF, unsigned Kind) const {
458   // This is inaccurate. It depends on the instruction and address space. The
459   // only place where we should hit this is for dealing with frame indexes /
460   // private accesses, so this is correct in that case.
461   return &AMDGPU::VGPR_32RegClass;
462 }
463 
464 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
465 
466   switch (Op) {
467   case AMDGPU::SI_SPILL_S1024_SAVE:
468   case AMDGPU::SI_SPILL_S1024_RESTORE:
469   case AMDGPU::SI_SPILL_V1024_SAVE:
470   case AMDGPU::SI_SPILL_V1024_RESTORE:
471   case AMDGPU::SI_SPILL_A1024_SAVE:
472   case AMDGPU::SI_SPILL_A1024_RESTORE:
473     return 32;
474   case AMDGPU::SI_SPILL_S512_SAVE:
475   case AMDGPU::SI_SPILL_S512_RESTORE:
476   case AMDGPU::SI_SPILL_V512_SAVE:
477   case AMDGPU::SI_SPILL_V512_RESTORE:
478   case AMDGPU::SI_SPILL_A512_SAVE:
479   case AMDGPU::SI_SPILL_A512_RESTORE:
480     return 16;
481   case AMDGPU::SI_SPILL_S256_SAVE:
482   case AMDGPU::SI_SPILL_S256_RESTORE:
483   case AMDGPU::SI_SPILL_V256_SAVE:
484   case AMDGPU::SI_SPILL_V256_RESTORE:
485     return 8;
486   case AMDGPU::SI_SPILL_S160_SAVE:
487   case AMDGPU::SI_SPILL_S160_RESTORE:
488   case AMDGPU::SI_SPILL_V160_SAVE:
489   case AMDGPU::SI_SPILL_V160_RESTORE:
490     return 5;
491   case AMDGPU::SI_SPILL_S128_SAVE:
492   case AMDGPU::SI_SPILL_S128_RESTORE:
493   case AMDGPU::SI_SPILL_V128_SAVE:
494   case AMDGPU::SI_SPILL_V128_RESTORE:
495   case AMDGPU::SI_SPILL_A128_SAVE:
496   case AMDGPU::SI_SPILL_A128_RESTORE:
497     return 4;
498   case AMDGPU::SI_SPILL_S96_SAVE:
499   case AMDGPU::SI_SPILL_S96_RESTORE:
500   case AMDGPU::SI_SPILL_V96_SAVE:
501   case AMDGPU::SI_SPILL_V96_RESTORE:
502     return 3;
503   case AMDGPU::SI_SPILL_S64_SAVE:
504   case AMDGPU::SI_SPILL_S64_RESTORE:
505   case AMDGPU::SI_SPILL_V64_SAVE:
506   case AMDGPU::SI_SPILL_V64_RESTORE:
507   case AMDGPU::SI_SPILL_A64_SAVE:
508   case AMDGPU::SI_SPILL_A64_RESTORE:
509     return 2;
510   case AMDGPU::SI_SPILL_S32_SAVE:
511   case AMDGPU::SI_SPILL_S32_RESTORE:
512   case AMDGPU::SI_SPILL_V32_SAVE:
513   case AMDGPU::SI_SPILL_V32_RESTORE:
514   case AMDGPU::SI_SPILL_A32_SAVE:
515   case AMDGPU::SI_SPILL_A32_RESTORE:
516     return 1;
517   default: llvm_unreachable("Invalid spill opcode");
518   }
519 }
520 
521 static int getOffsetMUBUFStore(unsigned Opc) {
522   switch (Opc) {
523   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
524     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
525   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
526     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
527   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
528     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
529   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
530     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
531   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
532     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
533   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
534     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
535   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
536     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
537   default:
538     return -1;
539   }
540 }
541 
542 static int getOffsetMUBUFLoad(unsigned Opc) {
543   switch (Opc) {
544   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
545     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
546   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
547     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
548   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
549     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
550   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
551     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
552   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
553     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
554   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
555     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
556   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
557     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
558   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
559     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
560   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
561     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
562   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
563     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
564   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
565     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
566   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
567     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
568   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
569     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
570   default:
571     return -1;
572   }
573 }
574 
575 static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
576                                            int Index,
577                                            unsigned Lane,
578                                            unsigned ValueReg,
579                                            bool IsKill) {
580   MachineBasicBlock *MBB = MI->getParent();
581   MachineFunction *MF = MI->getParent()->getParent();
582   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
583   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
584   const SIInstrInfo *TII = ST.getInstrInfo();
585 
586   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
587 
588   if (Reg == AMDGPU::NoRegister)
589     return MachineInstrBuilder();
590 
591   bool IsStore = MI->mayStore();
592   MachineRegisterInfo &MRI = MF->getRegInfo();
593   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
594 
595   unsigned Dst = IsStore ? Reg : ValueReg;
596   unsigned Src = IsStore ? ValueReg : Reg;
597   unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
598                                                    : AMDGPU::V_ACCVGPR_READ_B32;
599 
600   return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
601            .addReg(Src, getKillRegState(IsKill));
602 }
603 
604 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
605 // need to handle the case where an SGPR may need to be spilled while spilling.
606 static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
607                                       MachineFrameInfo &MFI,
608                                       MachineBasicBlock::iterator MI,
609                                       int Index,
610                                       int64_t Offset) {
611   MachineBasicBlock *MBB = MI->getParent();
612   const DebugLoc &DL = MI->getDebugLoc();
613   bool IsStore = MI->mayStore();
614 
615   unsigned Opc = MI->getOpcode();
616   int LoadStoreOp = IsStore ?
617     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
618   if (LoadStoreOp == -1)
619     return false;
620 
621   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
622   if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
623     return true;
624 
625   MachineInstrBuilder NewMI =
626       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
627           .add(*Reg)
628           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
629           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
630           .addImm(Offset)
631           .addImm(0) // glc
632           .addImm(0) // slc
633           .addImm(0) // tfe
634           .addImm(0) // dlc
635           .cloneMemRefs(*MI);
636 
637   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
638                                                        AMDGPU::OpName::vdata_in);
639   if (VDataIn)
640     NewMI.add(*VDataIn);
641   return true;
642 }
643 
644 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
645                                          unsigned LoadStoreOp,
646                                          int Index,
647                                          unsigned ValueReg,
648                                          bool IsKill,
649                                          unsigned ScratchRsrcReg,
650                                          unsigned ScratchOffsetReg,
651                                          int64_t InstOffset,
652                                          MachineMemOperand *MMO,
653                                          RegScavenger *RS) const {
654   MachineBasicBlock *MBB = MI->getParent();
655   MachineFunction *MF = MI->getParent()->getParent();
656   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
657   const SIInstrInfo *TII = ST.getInstrInfo();
658   const MachineFrameInfo &MFI = MF->getFrameInfo();
659 
660   const MCInstrDesc &Desc = TII->get(LoadStoreOp);
661   const DebugLoc &DL = MI->getDebugLoc();
662   bool IsStore = Desc.mayStore();
663 
664   bool Scavenged = false;
665   unsigned SOffset = ScratchOffsetReg;
666 
667   const unsigned EltSize = 4;
668   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
669   unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
670   unsigned Size = NumSubRegs * EltSize;
671   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
672   int64_t ScratchOffsetRegDelta = 0;
673 
674   unsigned Align = MFI.getObjectAlignment(Index);
675   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
676 
677   Register TmpReg =
678     hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
679                  : Register();
680 
681   assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
682 
683   if (!isUInt<12>(Offset + Size - EltSize)) {
684     SOffset = AMDGPU::NoRegister;
685 
686     // We currently only support spilling VGPRs to EltSize boundaries, meaning
687     // we can simplify the adjustment of Offset here to just scale with
688     // WavefrontSize.
689     Offset *= ST.getWavefrontSize();
690 
691     // We don't have access to the register scavenger if this function is called
692     // during  PEI::scavengeFrameVirtualRegs().
693     if (RS)
694       SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
695 
696     if (SOffset == AMDGPU::NoRegister) {
697       // There are no free SGPRs, and since we are in the process of spilling
698       // VGPRs too.  Since we need a VGPR in order to spill SGPRs (this is true
699       // on SI/CI and on VI it is true until we implement spilling using scalar
700       // stores), we have no way to free up an SGPR.  Our solution here is to
701       // add the offset directly to the ScratchOffset register, and then
702       // subtract the offset after the spill to return ScratchOffset to it's
703       // original value.
704       SOffset = ScratchOffsetReg;
705       ScratchOffsetRegDelta = Offset;
706     } else {
707       Scavenged = true;
708     }
709 
710     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
711       .addReg(ScratchOffsetReg)
712       .addImm(Offset);
713 
714     Offset = 0;
715   }
716 
717   for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
718     unsigned SubReg = NumSubRegs == 1 ?
719       Register(ValueReg) : getSubReg(ValueReg, getSubRegFromChannel(i));
720 
721     unsigned SOffsetRegState = 0;
722     unsigned SrcDstRegState = getDefRegState(!IsStore);
723     if (i + 1 == e) {
724       SOffsetRegState |= getKillRegState(Scavenged);
725       // The last implicit use carries the "Kill" flag.
726       SrcDstRegState |= getKillRegState(IsKill);
727     }
728 
729     auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
730 
731     if (!MIB.getInstr()) {
732       unsigned FinalReg = SubReg;
733       if (TmpReg != AMDGPU::NoRegister) {
734         if (IsStore)
735           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
736             .addReg(SubReg, getKillRegState(IsKill));
737         SubReg = TmpReg;
738       }
739 
740       MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
741       MachineMemOperand *NewMMO
742         = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
743                                    EltSize, MinAlign(Align, EltSize * i));
744 
745       MIB = BuildMI(*MBB, MI, DL, Desc)
746         .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
747         .addReg(ScratchRsrcReg)
748         .addReg(SOffset, SOffsetRegState)
749         .addImm(Offset)
750         .addImm(0) // glc
751         .addImm(0) // slc
752         .addImm(0) // tfe
753         .addImm(0) // dlc
754         .addMemOperand(NewMMO);
755 
756       if (!IsStore && TmpReg != AMDGPU::NoRegister)
757         MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
758                       FinalReg)
759           .addReg(TmpReg, RegState::Kill);
760     }
761 
762     if (NumSubRegs > 1)
763       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
764   }
765 
766   if (ScratchOffsetRegDelta != 0) {
767     // Subtract the offset we added to the ScratchOffset register.
768     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
769         .addReg(ScratchOffsetReg)
770         .addImm(ScratchOffsetRegDelta);
771   }
772 }
773 
774 static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
775                                                      bool Store) {
776   if (SuperRegSize % 16 == 0) {
777     return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
778                          AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
779   }
780 
781   if (SuperRegSize % 8 == 0) {
782     return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
783                         AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
784   }
785 
786   return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
787                       AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
788 }
789 
790 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
791                                int Index,
792                                RegScavenger *RS,
793                                bool OnlyToVGPR) const {
794   MachineBasicBlock *MBB = MI->getParent();
795   MachineFunction *MF = MBB->getParent();
796   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
797   DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
798 
799   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
800     = MFI->getSGPRToVGPRSpills(Index);
801   bool SpillToVGPR = !VGPRSpills.empty();
802   if (OnlyToVGPR && !SpillToVGPR)
803     return false;
804 
805   MachineRegisterInfo &MRI = MF->getRegInfo();
806   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
807   const SIInstrInfo *TII = ST.getInstrInfo();
808 
809   Register SuperReg = MI->getOperand(0).getReg();
810   bool IsKill = MI->getOperand(0).isKill();
811   const DebugLoc &DL = MI->getDebugLoc();
812 
813   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
814 
815   bool SpillToSMEM = spillSGPRToSMEM();
816   if (SpillToSMEM && OnlyToVGPR)
817     return false;
818 
819   Register FrameReg = getFrameRegister(*MF);
820 
821   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
822                          SuperReg != MFI->getFrameOffsetReg() &&
823                          SuperReg != MFI->getScratchWaveOffsetReg()));
824 
825   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
826 
827   unsigned OffsetReg = AMDGPU::M0;
828   unsigned M0CopyReg = AMDGPU::NoRegister;
829 
830   if (SpillToSMEM) {
831     if (RS->isRegUsed(AMDGPU::M0)) {
832       M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
833       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
834         .addReg(AMDGPU::M0);
835     }
836   }
837 
838   unsigned ScalarStoreOp;
839   unsigned EltSize = 4;
840   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
841   if (SpillToSMEM && isSGPRClass(RC)) {
842     // XXX - if private_element_size is larger than 4 it might be useful to be
843     // able to spill wider vmem spills.
844     std::tie(EltSize, ScalarStoreOp) =
845           getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
846   }
847 
848   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
849   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
850 
851   // SubReg carries the "Kill" flag when SubReg == SuperReg.
852   unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
853   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
854     unsigned SubReg = NumSubRegs == 1 ?
855       SuperReg : getSubReg(SuperReg, SplitParts[i]);
856 
857     if (SpillToSMEM) {
858       int64_t FrOffset = FrameInfo.getObjectOffset(Index);
859 
860       // The allocated memory size is really the wavefront size * the frame
861       // index size. The widest register class is 64 bytes, so a 4-byte scratch
862       // allocation is enough to spill this in a single stack object.
863       //
864       // FIXME: Frame size/offsets are computed earlier than this, so the extra
865       // space is still unnecessarily allocated.
866 
867       unsigned Align = FrameInfo.getObjectAlignment(Index);
868       MachinePointerInfo PtrInfo
869         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
870       MachineMemOperand *MMO
871         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
872                                    EltSize, MinAlign(Align, EltSize * i));
873 
874       // SMEM instructions only support a single offset, so increment the wave
875       // offset.
876 
877       int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
878       if (Offset != 0) {
879         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
880           .addReg(FrameReg)
881           .addImm(Offset);
882       } else {
883         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
884           .addReg(FrameReg);
885       }
886 
887       BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
888         .addReg(SubReg, getKillRegState(IsKill)) // sdata
889         .addReg(MFI->getScratchRSrcReg())        // sbase
890         .addReg(OffsetReg, RegState::Kill)       // soff
891         .addImm(0)                               // glc
892         .addImm(0)                               // dlc
893         .addMemOperand(MMO);
894 
895       continue;
896     }
897 
898     if (SpillToVGPR) {
899       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
900 
901       // During SGPR spilling to VGPR, determine if the VGPR is defined. The
902       // only circumstance in which we say it is undefined is when it is the
903       // first spill to this VGPR in the first basic block.
904       bool VGPRDefined = true;
905       if (MBB == &MF->front())
906         VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
907 
908       // Mark the "old value of vgpr" input undef only if this is the first sgpr
909       // spill to this specific vgpr in the first basic block.
910       BuildMI(*MBB, MI, DL,
911               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
912               Spill.VGPR)
913         .addReg(SubReg, getKillRegState(IsKill))
914         .addImm(Spill.Lane)
915         .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
916 
917       // FIXME: Since this spills to another register instead of an actual
918       // frame index, we should delete the frame index when all references to
919       // it are fixed.
920     } else {
921       // XXX - Can to VGPR spill fail for some subregisters but not others?
922       if (OnlyToVGPR)
923         return false;
924 
925       // Spill SGPR to a frame index.
926       // TODO: Should VI try to spill to VGPR and then spill to SMEM?
927       unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
928       // TODO: Should VI try to spill to VGPR and then spill to SMEM?
929 
930       MachineInstrBuilder Mov
931         = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
932         .addReg(SubReg, SubKillState);
933 
934 
935       // There could be undef components of a spilled super register.
936       // TODO: Can we detect this and skip the spill?
937       if (NumSubRegs > 1) {
938         // The last implicit use of the SuperReg carries the "Kill" flag.
939         unsigned SuperKillState = 0;
940         if (i + 1 == e)
941           SuperKillState |= getKillRegState(IsKill);
942         Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
943       }
944 
945       unsigned Align = FrameInfo.getObjectAlignment(Index);
946       MachinePointerInfo PtrInfo
947         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
948       MachineMemOperand *MMO
949         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
950                                    EltSize, MinAlign(Align, EltSize * i));
951       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
952         .addReg(TmpReg, RegState::Kill)       // src
953         .addFrameIndex(Index)                 // vaddr
954         .addReg(MFI->getScratchRSrcReg())     // srrsrc
955         .addReg(MFI->getStackPtrOffsetReg())  // soffset
956         .addImm(i * 4)                        // offset
957         .addMemOperand(MMO);
958     }
959   }
960 
961   if (M0CopyReg != AMDGPU::NoRegister) {
962     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
963       .addReg(M0CopyReg, RegState::Kill);
964   }
965 
966   MI->eraseFromParent();
967   MFI->addToSpilledSGPRs(NumSubRegs);
968   return true;
969 }
970 
971 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
972                                  int Index,
973                                  RegScavenger *RS,
974                                  bool OnlyToVGPR) const {
975   MachineFunction *MF = MI->getParent()->getParent();
976   MachineRegisterInfo &MRI = MF->getRegInfo();
977   MachineBasicBlock *MBB = MI->getParent();
978   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
979 
980   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
981     = MFI->getSGPRToVGPRSpills(Index);
982   bool SpillToVGPR = !VGPRSpills.empty();
983   if (OnlyToVGPR && !SpillToVGPR)
984     return false;
985 
986   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
987   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
988   const SIInstrInfo *TII = ST.getInstrInfo();
989   const DebugLoc &DL = MI->getDebugLoc();
990 
991   Register SuperReg = MI->getOperand(0).getReg();
992   bool SpillToSMEM = spillSGPRToSMEM();
993   if (SpillToSMEM && OnlyToVGPR)
994     return false;
995 
996   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
997 
998   unsigned OffsetReg = AMDGPU::M0;
999   unsigned M0CopyReg = AMDGPU::NoRegister;
1000 
1001   if (SpillToSMEM) {
1002     if (RS->isRegUsed(AMDGPU::M0)) {
1003       M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1004       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
1005         .addReg(AMDGPU::M0);
1006     }
1007   }
1008 
1009   unsigned EltSize = 4;
1010   unsigned ScalarLoadOp;
1011 
1012   Register FrameReg = getFrameRegister(*MF);
1013 
1014   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1015   if (SpillToSMEM && isSGPRClass(RC)) {
1016     // XXX - if private_element_size is larger than 4 it might be useful to be
1017     // able to spill wider vmem spills.
1018     std::tie(EltSize, ScalarLoadOp) =
1019           getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
1020   }
1021 
1022   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1023   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1024 
1025   // SubReg carries the "Kill" flag when SubReg == SuperReg.
1026   int64_t FrOffset = FrameInfo.getObjectOffset(Index);
1027 
1028   for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1029     unsigned SubReg = NumSubRegs == 1 ?
1030       SuperReg : getSubReg(SuperReg, SplitParts[i]);
1031 
1032     if (SpillToSMEM) {
1033       // FIXME: Size may be > 4 but extra bytes wasted.
1034       unsigned Align = FrameInfo.getObjectAlignment(Index);
1035       MachinePointerInfo PtrInfo
1036         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
1037       MachineMemOperand *MMO
1038         = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
1039                                    EltSize, MinAlign(Align, EltSize * i));
1040 
1041       // Add i * 4 offset
1042       int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
1043       if (Offset != 0) {
1044         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
1045           .addReg(FrameReg)
1046           .addImm(Offset);
1047       } else {
1048         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
1049           .addReg(FrameReg);
1050       }
1051 
1052       auto MIB =
1053         BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
1054         .addReg(MFI->getScratchRSrcReg())  // sbase
1055         .addReg(OffsetReg, RegState::Kill) // soff
1056         .addImm(0)                         // glc
1057         .addImm(0)                         // dlc
1058         .addMemOperand(MMO);
1059 
1060       if (NumSubRegs > 1 && i == 0)
1061         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1062 
1063       continue;
1064     }
1065 
1066     if (SpillToVGPR) {
1067       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1068       auto MIB =
1069         BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
1070                 SubReg)
1071         .addReg(Spill.VGPR)
1072         .addImm(Spill.Lane);
1073 
1074       if (NumSubRegs > 1 && i == 0)
1075         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1076     } else {
1077       if (OnlyToVGPR)
1078         return false;
1079 
1080       // Restore SGPR from a stack slot.
1081       // FIXME: We should use S_LOAD_DWORD here for VI.
1082       unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1083       unsigned Align = FrameInfo.getObjectAlignment(Index);
1084 
1085       MachinePointerInfo PtrInfo
1086         = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
1087 
1088       MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
1089         MachineMemOperand::MOLoad, EltSize,
1090         MinAlign(Align, EltSize * i));
1091 
1092       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
1093         .addFrameIndex(Index)                 // vaddr
1094         .addReg(MFI->getScratchRSrcReg())     // srsrc
1095         .addReg(MFI->getStackPtrOffsetReg())  // soffset
1096         .addImm(i * 4)                        // offset
1097         .addMemOperand(MMO);
1098 
1099       auto MIB =
1100         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
1101         .addReg(TmpReg, RegState::Kill);
1102 
1103       if (NumSubRegs > 1)
1104         MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
1105     }
1106   }
1107 
1108   if (M0CopyReg != AMDGPU::NoRegister) {
1109     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
1110       .addReg(M0CopyReg, RegState::Kill);
1111   }
1112 
1113   MI->eraseFromParent();
1114   return true;
1115 }
1116 
1117 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1118 /// a VGPR and the stack slot can be safely eliminated when all other users are
1119 /// handled.
1120 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
1121   MachineBasicBlock::iterator MI,
1122   int FI,
1123   RegScavenger *RS) const {
1124   switch (MI->getOpcode()) {
1125   case AMDGPU::SI_SPILL_S1024_SAVE:
1126   case AMDGPU::SI_SPILL_S512_SAVE:
1127   case AMDGPU::SI_SPILL_S256_SAVE:
1128   case AMDGPU::SI_SPILL_S160_SAVE:
1129   case AMDGPU::SI_SPILL_S128_SAVE:
1130   case AMDGPU::SI_SPILL_S96_SAVE:
1131   case AMDGPU::SI_SPILL_S64_SAVE:
1132   case AMDGPU::SI_SPILL_S32_SAVE:
1133     return spillSGPR(MI, FI, RS, true);
1134   case AMDGPU::SI_SPILL_S1024_RESTORE:
1135   case AMDGPU::SI_SPILL_S512_RESTORE:
1136   case AMDGPU::SI_SPILL_S256_RESTORE:
1137   case AMDGPU::SI_SPILL_S160_RESTORE:
1138   case AMDGPU::SI_SPILL_S128_RESTORE:
1139   case AMDGPU::SI_SPILL_S96_RESTORE:
1140   case AMDGPU::SI_SPILL_S64_RESTORE:
1141   case AMDGPU::SI_SPILL_S32_RESTORE:
1142     return restoreSGPR(MI, FI, RS, true);
1143   default:
1144     llvm_unreachable("not an SGPR spill instruction");
1145   }
1146 }
1147 
1148 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
1149                                         int SPAdj, unsigned FIOperandNum,
1150                                         RegScavenger *RS) const {
1151   MachineFunction *MF = MI->getParent()->getParent();
1152   MachineRegisterInfo &MRI = MF->getRegInfo();
1153   MachineBasicBlock *MBB = MI->getParent();
1154   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1155   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1156   const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
1157   const SIInstrInfo *TII = ST.getInstrInfo();
1158   DebugLoc DL = MI->getDebugLoc();
1159 
1160   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1161 
1162   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1163   int Index = MI->getOperand(FIOperandNum).getIndex();
1164 
1165   Register FrameReg = getFrameRegister(*MF);
1166 
1167   switch (MI->getOpcode()) {
1168     // SGPR register spill
1169     case AMDGPU::SI_SPILL_S1024_SAVE:
1170     case AMDGPU::SI_SPILL_S512_SAVE:
1171     case AMDGPU::SI_SPILL_S256_SAVE:
1172     case AMDGPU::SI_SPILL_S160_SAVE:
1173     case AMDGPU::SI_SPILL_S128_SAVE:
1174     case AMDGPU::SI_SPILL_S96_SAVE:
1175     case AMDGPU::SI_SPILL_S64_SAVE:
1176     case AMDGPU::SI_SPILL_S32_SAVE: {
1177       spillSGPR(MI, Index, RS);
1178       break;
1179     }
1180 
1181     // SGPR register restore
1182     case AMDGPU::SI_SPILL_S1024_RESTORE:
1183     case AMDGPU::SI_SPILL_S512_RESTORE:
1184     case AMDGPU::SI_SPILL_S256_RESTORE:
1185     case AMDGPU::SI_SPILL_S160_RESTORE:
1186     case AMDGPU::SI_SPILL_S128_RESTORE:
1187     case AMDGPU::SI_SPILL_S96_RESTORE:
1188     case AMDGPU::SI_SPILL_S64_RESTORE:
1189     case AMDGPU::SI_SPILL_S32_RESTORE: {
1190       restoreSGPR(MI, Index, RS);
1191       break;
1192     }
1193 
1194     // VGPR register spill
1195     case AMDGPU::SI_SPILL_V1024_SAVE:
1196     case AMDGPU::SI_SPILL_V512_SAVE:
1197     case AMDGPU::SI_SPILL_V256_SAVE:
1198     case AMDGPU::SI_SPILL_V160_SAVE:
1199     case AMDGPU::SI_SPILL_V128_SAVE:
1200     case AMDGPU::SI_SPILL_V96_SAVE:
1201     case AMDGPU::SI_SPILL_V64_SAVE:
1202     case AMDGPU::SI_SPILL_V32_SAVE:
1203     case AMDGPU::SI_SPILL_A1024_SAVE:
1204     case AMDGPU::SI_SPILL_A512_SAVE:
1205     case AMDGPU::SI_SPILL_A128_SAVE:
1206     case AMDGPU::SI_SPILL_A64_SAVE:
1207     case AMDGPU::SI_SPILL_A32_SAVE: {
1208       const MachineOperand *VData = TII->getNamedOperand(*MI,
1209                                                          AMDGPU::OpName::vdata);
1210       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1211              MFI->getStackPtrOffsetReg());
1212 
1213       buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
1214             Index,
1215             VData->getReg(), VData->isKill(),
1216             TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1217             FrameReg,
1218             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1219             *MI->memoperands_begin(),
1220             RS);
1221       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1222       MI->eraseFromParent();
1223       break;
1224     }
1225     case AMDGPU::SI_SPILL_V32_RESTORE:
1226     case AMDGPU::SI_SPILL_V64_RESTORE:
1227     case AMDGPU::SI_SPILL_V96_RESTORE:
1228     case AMDGPU::SI_SPILL_V128_RESTORE:
1229     case AMDGPU::SI_SPILL_V160_RESTORE:
1230     case AMDGPU::SI_SPILL_V256_RESTORE:
1231     case AMDGPU::SI_SPILL_V512_RESTORE:
1232     case AMDGPU::SI_SPILL_V1024_RESTORE:
1233     case AMDGPU::SI_SPILL_A32_RESTORE:
1234     case AMDGPU::SI_SPILL_A64_RESTORE:
1235     case AMDGPU::SI_SPILL_A128_RESTORE:
1236     case AMDGPU::SI_SPILL_A512_RESTORE:
1237     case AMDGPU::SI_SPILL_A1024_RESTORE: {
1238       const MachineOperand *VData = TII->getNamedOperand(*MI,
1239                                                          AMDGPU::OpName::vdata);
1240       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1241              MFI->getStackPtrOffsetReg());
1242 
1243       buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
1244             Index,
1245             VData->getReg(), VData->isKill(),
1246             TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
1247             FrameReg,
1248             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1249             *MI->memoperands_begin(),
1250             RS);
1251       MI->eraseFromParent();
1252       break;
1253     }
1254 
1255     default: {
1256       const DebugLoc &DL = MI->getDebugLoc();
1257       bool IsMUBUF = TII->isMUBUF(*MI);
1258 
1259       if (!IsMUBUF && !MFI->isEntryFunction()) {
1260         // Convert to an absolute stack address by finding the offset from the
1261         // scratch wave base and scaling by the wave size.
1262         //
1263         // In an entry function/kernel the offset is already the absolute
1264         // address relative to the frame register.
1265 
1266         unsigned DiffReg
1267           = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1268 
1269         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1270         Register ResultReg = IsCopy ?
1271           MI->getOperand(0).getReg() :
1272           MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1273 
1274         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
1275           .addReg(FrameReg)
1276           .addReg(MFI->getScratchWaveOffsetReg());
1277 
1278         int64_t Offset = FrameInfo.getObjectOffset(Index);
1279         if (Offset == 0) {
1280           // XXX - This never happens because of emergency scavenging slot at 0?
1281           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1282             .addImm(Log2_32(ST.getWavefrontSize()))
1283             .addReg(DiffReg);
1284         } else {
1285           unsigned ScaledReg
1286             = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1287 
1288           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
1289             .addImm(Log2_32(ST.getWavefrontSize()))
1290             .addReg(DiffReg, RegState::Kill);
1291 
1292           // TODO: Fold if use instruction is another add of a constant.
1293           if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1294             TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
1295               .addImm(Offset)
1296               .addReg(ScaledReg, RegState::Kill)
1297               .addImm(0); // clamp bit
1298           } else {
1299             unsigned ConstOffsetReg
1300               = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
1301 
1302             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
1303               .addImm(Offset);
1304             TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
1305               .addReg(ConstOffsetReg, RegState::Kill)
1306               .addReg(ScaledReg, RegState::Kill)
1307               .addImm(0); // clamp bit
1308           }
1309         }
1310 
1311         // Don't introduce an extra copy if we're just materializing in a mov.
1312         if (IsCopy)
1313           MI->eraseFromParent();
1314         else
1315           FIOp.ChangeToRegister(ResultReg, false, false, true);
1316         return;
1317       }
1318 
1319       if (IsMUBUF) {
1320         // Disable offen so we don't need a 0 vgpr base.
1321         assert(static_cast<int>(FIOperandNum) ==
1322                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1323                                           AMDGPU::OpName::vaddr));
1324 
1325         assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1326                MFI->getStackPtrOffsetReg());
1327 
1328         TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
1329 
1330         int64_t Offset = FrameInfo.getObjectOffset(Index);
1331         int64_t OldImm
1332           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1333         int64_t NewOffset = OldImm + Offset;
1334 
1335         if (isUInt<12>(NewOffset) &&
1336             buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
1337           MI->eraseFromParent();
1338           return;
1339         }
1340       }
1341 
1342       // If the offset is simply too big, don't convert to a scratch wave offset
1343       // relative index.
1344 
1345       int64_t Offset = FrameInfo.getObjectOffset(Index);
1346       FIOp.ChangeToImmediate(Offset);
1347       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1348         unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1349         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1350           .addImm(Offset);
1351         FIOp.ChangeToRegister(TmpReg, false, false, true);
1352       }
1353     }
1354   }
1355 }
1356 
1357 StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
1358   return AMDGPUInstPrinter::getRegisterName(Reg);
1359 }
1360 
1361 // FIXME: This is very slow. It might be worth creating a map from physreg to
1362 // register class.
1363 const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
1364   assert(!Register::isVirtualRegister(Reg));
1365 
1366   static const TargetRegisterClass *const BaseClasses[] = {
1367     &AMDGPU::VGPR_32RegClass,
1368     &AMDGPU::SReg_32RegClass,
1369     &AMDGPU::AGPR_32RegClass,
1370     &AMDGPU::VReg_64RegClass,
1371     &AMDGPU::SReg_64RegClass,
1372     &AMDGPU::AReg_64RegClass,
1373     &AMDGPU::VReg_96RegClass,
1374     &AMDGPU::SReg_96RegClass,
1375     &AMDGPU::VReg_128RegClass,
1376     &AMDGPU::SReg_128RegClass,
1377     &AMDGPU::AReg_128RegClass,
1378     &AMDGPU::VReg_160RegClass,
1379     &AMDGPU::SReg_160RegClass,
1380     &AMDGPU::VReg_256RegClass,
1381     &AMDGPU::SReg_256RegClass,
1382     &AMDGPU::VReg_512RegClass,
1383     &AMDGPU::SReg_512RegClass,
1384     &AMDGPU::AReg_512RegClass,
1385     &AMDGPU::SReg_1024RegClass,
1386     &AMDGPU::VReg_1024RegClass,
1387     &AMDGPU::AReg_1024RegClass,
1388     &AMDGPU::SCC_CLASSRegClass,
1389     &AMDGPU::Pseudo_SReg_32RegClass,
1390     &AMDGPU::Pseudo_SReg_128RegClass,
1391   };
1392 
1393   for (const TargetRegisterClass *BaseClass : BaseClasses) {
1394     if (BaseClass->contains(Reg)) {
1395       return BaseClass;
1396     }
1397   }
1398   return nullptr;
1399 }
1400 
1401 // TODO: It might be helpful to have some target specific flags in
1402 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1403 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1404   unsigned Size = getRegSizeInBits(*RC);
1405   if (Size < 32)
1406     return false;
1407   switch (Size) {
1408   case 32:
1409     return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
1410   case 64:
1411     return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
1412   case 96:
1413     return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
1414   case 128:
1415     return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
1416   case 160:
1417     return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
1418   case 256:
1419     return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
1420   case 512:
1421     return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
1422   case 1024:
1423     return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
1424   default:
1425     llvm_unreachable("Invalid register class size");
1426   }
1427 }
1428 
1429 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
1430   unsigned Size = getRegSizeInBits(*RC);
1431   if (Size < 32)
1432     return false;
1433   switch (Size) {
1434   case 32:
1435     return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
1436   case 64:
1437     return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
1438   case 96:
1439     return false;
1440   case 128:
1441     return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
1442   case 160:
1443   case 256:
1444     return false;
1445   case 512:
1446     return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
1447   case 1024:
1448     return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
1449   default:
1450     llvm_unreachable("Invalid register class size");
1451   }
1452 }
1453 
1454 const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
1455                                          const TargetRegisterClass *SRC) const {
1456   switch (getRegSizeInBits(*SRC)) {
1457   case 32:
1458     return &AMDGPU::VGPR_32RegClass;
1459   case 64:
1460     return &AMDGPU::VReg_64RegClass;
1461   case 96:
1462     return &AMDGPU::VReg_96RegClass;
1463   case 128:
1464     return &AMDGPU::VReg_128RegClass;
1465   case 160:
1466     return &AMDGPU::VReg_160RegClass;
1467   case 256:
1468     return &AMDGPU::VReg_256RegClass;
1469   case 512:
1470     return &AMDGPU::VReg_512RegClass;
1471   case 1024:
1472     return &AMDGPU::VReg_1024RegClass;
1473   default:
1474     llvm_unreachable("Invalid register class size");
1475   }
1476 }
1477 
1478 const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
1479                                          const TargetRegisterClass *SRC) const {
1480   switch (getRegSizeInBits(*SRC)) {
1481   case 32:
1482     return &AMDGPU::AGPR_32RegClass;
1483   case 64:
1484     return &AMDGPU::AReg_64RegClass;
1485   case 128:
1486     return &AMDGPU::AReg_128RegClass;
1487   case 512:
1488     return &AMDGPU::AReg_512RegClass;
1489   case 1024:
1490     return &AMDGPU::AReg_1024RegClass;
1491   default:
1492     llvm_unreachable("Invalid register class size");
1493   }
1494 }
1495 
1496 const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
1497                                          const TargetRegisterClass *VRC) const {
1498   switch (getRegSizeInBits(*VRC)) {
1499   case 32:
1500     return &AMDGPU::SGPR_32RegClass;
1501   case 64:
1502     return &AMDGPU::SReg_64RegClass;
1503   case 96:
1504     return &AMDGPU::SReg_96RegClass;
1505   case 128:
1506     return &AMDGPU::SReg_128RegClass;
1507   case 160:
1508     return &AMDGPU::SReg_160RegClass;
1509   case 256:
1510     return &AMDGPU::SReg_256RegClass;
1511   case 512:
1512     return &AMDGPU::SReg_512RegClass;
1513   case 1024:
1514     return &AMDGPU::SReg_1024RegClass;
1515   default:
1516     llvm_unreachable("Invalid register class size");
1517   }
1518 }
1519 
1520 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
1521                          const TargetRegisterClass *RC, unsigned SubIdx) const {
1522   if (SubIdx == AMDGPU::NoSubRegister)
1523     return RC;
1524 
1525   // We can assume that each lane corresponds to one 32-bit register.
1526   unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
1527   if (isSGPRClass(RC)) {
1528     switch (Count) {
1529     case 1:
1530       return &AMDGPU::SGPR_32RegClass;
1531     case 2:
1532       return &AMDGPU::SReg_64RegClass;
1533     case 3:
1534       return &AMDGPU::SReg_96RegClass;
1535     case 4:
1536       return &AMDGPU::SReg_128RegClass;
1537     case 5:
1538       return &AMDGPU::SReg_160RegClass;
1539     case 8:
1540       return &AMDGPU::SReg_256RegClass;
1541     case 16:
1542       return &AMDGPU::SReg_512RegClass;
1543     case 32: /* fall-through */
1544     default:
1545       llvm_unreachable("Invalid sub-register class size");
1546     }
1547   } else if (hasAGPRs(RC)) {
1548     switch (Count) {
1549     case 1:
1550       return &AMDGPU::AGPR_32RegClass;
1551     case 2:
1552       return &AMDGPU::AReg_64RegClass;
1553     case 4:
1554       return &AMDGPU::AReg_128RegClass;
1555     case 16:
1556       return &AMDGPU::AReg_512RegClass;
1557     case 32: /* fall-through */
1558     default:
1559       llvm_unreachable("Invalid sub-register class size");
1560     }
1561   } else {
1562     switch (Count) {
1563     case 1:
1564       return &AMDGPU::VGPR_32RegClass;
1565     case 2:
1566       return &AMDGPU::VReg_64RegClass;
1567     case 3:
1568       return &AMDGPU::VReg_96RegClass;
1569     case 4:
1570       return &AMDGPU::VReg_128RegClass;
1571     case 5:
1572       return &AMDGPU::VReg_160RegClass;
1573     case 8:
1574       return &AMDGPU::VReg_256RegClass;
1575     case 16:
1576       return &AMDGPU::VReg_512RegClass;
1577     case 32: /* fall-through */
1578     default:
1579       llvm_unreachable("Invalid sub-register class size");
1580     }
1581   }
1582 }
1583 
1584 bool SIRegisterInfo::shouldRewriteCopySrc(
1585   const TargetRegisterClass *DefRC,
1586   unsigned DefSubReg,
1587   const TargetRegisterClass *SrcRC,
1588   unsigned SrcSubReg) const {
1589   // We want to prefer the smallest register class possible, so we don't want to
1590   // stop and rewrite on anything that looks like a subregister
1591   // extract. Operations mostly don't care about the super register class, so we
1592   // only want to stop on the most basic of copies between the same register
1593   // class.
1594   //
1595   // e.g. if we have something like
1596   // %0 = ...
1597   // %1 = ...
1598   // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
1599   // %3 = COPY %2, sub0
1600   //
1601   // We want to look through the COPY to find:
1602   //  => %3 = COPY %0
1603 
1604   // Plain copy.
1605   return getCommonSubClass(DefRC, SrcRC) != nullptr;
1606 }
1607 
1608 /// Returns a register that is not used at any point in the function.
1609 ///        If all registers are used, then this function will return
1610 //         AMDGPU::NoRegister.
1611 unsigned
1612 SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
1613                                    const TargetRegisterClass *RC,
1614                                    const MachineFunction &MF) const {
1615 
1616   for (unsigned Reg : *RC)
1617     if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
1618       return Reg;
1619   return AMDGPU::NoRegister;
1620 }
1621 
1622 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
1623                                                    unsigned EltSize) const {
1624   if (EltSize == 4) {
1625     static const int16_t Sub0_31[] = {
1626       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1627       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1628       AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1629       AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1630       AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
1631       AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
1632       AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
1633       AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
1634     };
1635 
1636     static const int16_t Sub0_15[] = {
1637       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1638       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1639       AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1640       AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1641     };
1642 
1643     static const int16_t Sub0_7[] = {
1644       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1645       AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1646     };
1647 
1648     static const int16_t Sub0_4[] = {
1649       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
1650     };
1651 
1652     static const int16_t Sub0_3[] = {
1653       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1654     };
1655 
1656     static const int16_t Sub0_2[] = {
1657       AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
1658     };
1659 
1660     static const int16_t Sub0_1[] = {
1661       AMDGPU::sub0, AMDGPU::sub1,
1662     };
1663 
1664     switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1665     case 32:
1666       return {};
1667     case 64:
1668       return makeArrayRef(Sub0_1);
1669     case 96:
1670       return makeArrayRef(Sub0_2);
1671     case 128:
1672       return makeArrayRef(Sub0_3);
1673     case 160:
1674       return makeArrayRef(Sub0_4);
1675     case 256:
1676       return makeArrayRef(Sub0_7);
1677     case 512:
1678       return makeArrayRef(Sub0_15);
1679     case 1024:
1680       return makeArrayRef(Sub0_31);
1681     default:
1682       llvm_unreachable("unhandled register size");
1683     }
1684   }
1685 
1686   if (EltSize == 8) {
1687     static const int16_t Sub0_31_64[] = {
1688       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1689       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1690       AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1691       AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1692       AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
1693       AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
1694       AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
1695       AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
1696     };
1697 
1698     static const int16_t Sub0_15_64[] = {
1699       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1700       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1701       AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1702       AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
1703     };
1704 
1705     static const int16_t Sub0_7_64[] = {
1706       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1707       AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
1708     };
1709 
1710 
1711     static const int16_t Sub0_3_64[] = {
1712       AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
1713     };
1714 
1715     switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1716     case 64:
1717       return {};
1718     case 128:
1719       return makeArrayRef(Sub0_3_64);
1720     case 256:
1721       return makeArrayRef(Sub0_7_64);
1722     case 512:
1723       return makeArrayRef(Sub0_15_64);
1724     case 1024:
1725       return makeArrayRef(Sub0_31_64);
1726     default:
1727       llvm_unreachable("unhandled register size");
1728     }
1729   }
1730 
1731   if (EltSize == 16) {
1732 
1733     static const int16_t Sub0_31_128[] = {
1734       AMDGPU::sub0_sub1_sub2_sub3,
1735       AMDGPU::sub4_sub5_sub6_sub7,
1736       AMDGPU::sub8_sub9_sub10_sub11,
1737       AMDGPU::sub12_sub13_sub14_sub15,
1738       AMDGPU::sub16_sub17_sub18_sub19,
1739       AMDGPU::sub20_sub21_sub22_sub23,
1740       AMDGPU::sub24_sub25_sub26_sub27,
1741       AMDGPU::sub28_sub29_sub30_sub31
1742     };
1743 
1744     static const int16_t Sub0_15_128[] = {
1745       AMDGPU::sub0_sub1_sub2_sub3,
1746       AMDGPU::sub4_sub5_sub6_sub7,
1747       AMDGPU::sub8_sub9_sub10_sub11,
1748       AMDGPU::sub12_sub13_sub14_sub15
1749     };
1750 
1751     static const int16_t Sub0_7_128[] = {
1752       AMDGPU::sub0_sub1_sub2_sub3,
1753       AMDGPU::sub4_sub5_sub6_sub7
1754     };
1755 
1756     switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1757     case 128:
1758       return {};
1759     case 256:
1760       return makeArrayRef(Sub0_7_128);
1761     case 512:
1762       return makeArrayRef(Sub0_15_128);
1763     case 1024:
1764       return makeArrayRef(Sub0_31_128);
1765     default:
1766       llvm_unreachable("unhandled register size");
1767     }
1768   }
1769 
1770   assert(EltSize == 32 && "unhandled elt size");
1771 
1772   static const int16_t Sub0_31_256[] = {
1773     AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
1774     AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
1775     AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
1776     AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
1777   };
1778 
1779   static const int16_t Sub0_15_256[] = {
1780     AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
1781     AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
1782   };
1783 
1784   switch (AMDGPU::getRegBitWidth(*RC->MC)) {
1785   case 256:
1786     return {};
1787   case 512:
1788     return makeArrayRef(Sub0_15_256);
1789   case 1024:
1790     return makeArrayRef(Sub0_31_256);
1791   default:
1792     llvm_unreachable("unhandled register size");
1793   }
1794 }
1795 
1796 const TargetRegisterClass*
1797 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
1798                                   unsigned Reg) const {
1799   if (Register::isVirtualRegister(Reg))
1800     return  MRI.getRegClass(Reg);
1801 
1802   return getPhysRegClass(Reg);
1803 }
1804 
1805 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
1806                             unsigned Reg) const {
1807   const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
1808   assert(RC && "Register class for the reg not found");
1809   return hasVGPRs(RC);
1810 }
1811 
1812 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
1813                             unsigned Reg) const {
1814   const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
1815   assert(RC && "Register class for the reg not found");
1816   return hasAGPRs(RC);
1817 }
1818 
1819 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
1820                                     const TargetRegisterClass *SrcRC,
1821                                     unsigned SubReg,
1822                                     const TargetRegisterClass *DstRC,
1823                                     unsigned DstSubReg,
1824                                     const TargetRegisterClass *NewRC,
1825                                     LiveIntervals &LIS) const {
1826   unsigned SrcSize = getRegSizeInBits(*SrcRC);
1827   unsigned DstSize = getRegSizeInBits(*DstRC);
1828   unsigned NewSize = getRegSizeInBits(*NewRC);
1829 
1830   // Do not increase size of registers beyond dword, we would need to allocate
1831   // adjacent registers and constraint regalloc more than needed.
1832 
1833   // Always allow dword coalescing.
1834   if (SrcSize <= 32 || DstSize <= 32)
1835     return true;
1836 
1837   return NewSize <= DstSize || NewSize <= SrcSize;
1838 }
1839 
1840 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
1841                                              MachineFunction &MF) const {
1842 
1843   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1844   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1845 
1846   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
1847                                                        MF.getFunction());
1848   switch (RC->getID()) {
1849   default:
1850     return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
1851   case AMDGPU::VGPR_32RegClassID:
1852     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
1853   case AMDGPU::SGPR_32RegClassID:
1854     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
1855   }
1856 }
1857 
1858 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
1859                                                 unsigned Idx) const {
1860   if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
1861     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
1862                                const_cast<MachineFunction &>(MF));
1863 
1864   if (Idx == getSGPRPressureSet())
1865     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
1866                                const_cast<MachineFunction &>(MF));
1867 
1868   return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
1869 }
1870 
1871 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
1872   static const int Empty[] = { -1 };
1873 
1874   if (hasRegUnit(AMDGPU::M0, RegUnit))
1875     return Empty;
1876   return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
1877 }
1878 
1879 unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
1880   // Not a callee saved register.
1881   return AMDGPU::SGPR30_SGPR31;
1882 }
1883 
1884 const TargetRegisterClass *
1885 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
1886                                          const RegisterBank &RB,
1887                                          const MachineRegisterInfo &MRI) const {
1888   switch (Size) {
1889   case 1: {
1890     switch (RB.getID()) {
1891     case AMDGPU::VGPRRegBankID:
1892       return &AMDGPU::VGPR_32RegClass;
1893     case AMDGPU::VCCRegBankID:
1894       return isWave32 ?
1895         &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
1896     case AMDGPU::SGPRRegBankID:
1897       return &AMDGPU::SReg_32_XM0RegClass;
1898     case AMDGPU::SCCRegBankID:
1899       // This needs to return an allocatable class, so don't bother returning
1900       // the dummy SCC class.
1901       return &AMDGPU::SReg_32_XM0RegClass;
1902     default:
1903       llvm_unreachable("unknown register bank");
1904     }
1905   }
1906   case 32:
1907     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
1908                                                  &AMDGPU::SReg_32_XM0RegClass;
1909   case 64:
1910     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
1911                                                  &AMDGPU::SReg_64_XEXECRegClass;
1912   case 96:
1913     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
1914                                                  &AMDGPU::SReg_96RegClass;
1915   case 128:
1916     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
1917                                                  &AMDGPU::SReg_128RegClass;
1918   case 160:
1919     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
1920                                                  &AMDGPU::SReg_160RegClass;
1921   case 256:
1922     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
1923                                                  &AMDGPU::SReg_256RegClass;
1924   case 512:
1925     return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
1926                                                  &AMDGPU::SReg_512RegClass;
1927   default:
1928     if (Size < 32)
1929       return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
1930                                                    &AMDGPU::SReg_32_XM0RegClass;
1931     return nullptr;
1932   }
1933 }
1934 
1935 const TargetRegisterClass *
1936 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
1937                                          const MachineRegisterInfo &MRI) const {
1938   if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
1939     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
1940   return nullptr;
1941 }
1942 
1943 unsigned SIRegisterInfo::getVCC() const {
1944   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
1945 }
1946 
1947 const TargetRegisterClass *
1948 SIRegisterInfo::getRegClass(unsigned RCID) const {
1949   switch ((int)RCID) {
1950   case AMDGPU::SReg_1RegClassID:
1951     return getBoolRC();
1952   case AMDGPU::SReg_1_XEXECRegClassID:
1953     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
1954       : &AMDGPU::SReg_64_XEXECRegClass;
1955   case -1:
1956     return nullptr;
1957   default:
1958     return AMDGPURegisterInfo::getRegClass(RCID);
1959   }
1960 }
1961 
1962 // Find reaching register definition
1963 MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
1964                                               MachineInstr &Use,
1965                                               MachineRegisterInfo &MRI,
1966                                               LiveIntervals *LIS) const {
1967   auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
1968   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
1969   SlotIndex DefIdx;
1970 
1971   if (Register::isVirtualRegister(Reg)) {
1972     if (!LIS->hasInterval(Reg))
1973       return nullptr;
1974     LiveInterval &LI = LIS->getInterval(Reg);
1975     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
1976                                   : MRI.getMaxLaneMaskForVReg(Reg);
1977     VNInfo *V = nullptr;
1978     if (LI.hasSubRanges()) {
1979       for (auto &S : LI.subranges()) {
1980         if ((S.LaneMask & SubLanes) == SubLanes) {
1981           V = S.getVNInfoAt(UseIdx);
1982           break;
1983         }
1984       }
1985     } else {
1986       V = LI.getVNInfoAt(UseIdx);
1987     }
1988     if (!V)
1989       return nullptr;
1990     DefIdx = V->def;
1991   } else {
1992     // Find last def.
1993     for (MCRegUnitIterator Units(Reg, this); Units.isValid(); ++Units) {
1994       LiveRange &LR = LIS->getRegUnit(*Units);
1995       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
1996         if (!DefIdx.isValid() ||
1997             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
1998                           LIS->getInstructionFromIndex(V->def)))
1999           DefIdx = V->def;
2000       } else {
2001         return nullptr;
2002       }
2003     }
2004   }
2005 
2006   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
2007 
2008   if (!Def || !MDT.dominates(Def, &Use))
2009     return nullptr;
2010 
2011   assert(Def->modifiesRegister(Reg, this));
2012 
2013   return Def;
2014 }
2015