1 //===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "SIInstrInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "GCNHazardRecognizer.h"
18 #include "SIDefines.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineInstrBuilder.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/CodeGen/ScheduleDAG.h"
24 #include "llvm/IR/Function.h"
25 #include "llvm/CodeGen/RegisterScavenging.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/Debug.h"
28 
29 using namespace llvm;
30 
// Must be at least 4 to be able to branch over the minimum unconditional
// branch code. This option exists only to make it possible to write reasonably
// small tests for long branches.
34 static cl::opt<unsigned>
35 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
36                  cl::desc("Restrict range of branch instructions (DEBUG)"));
37 
38 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
39   : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
40 
41 //===----------------------------------------------------------------------===//
42 // TargetInstrInfo callbacks
43 //===----------------------------------------------------------------------===//
44 
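// Return the number of operands of \p Node, not counting any trailing glue
// operands.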
45 static unsigned getNumOperandsNoGlue(SDNode *Node) {
46   unsigned N = Node->getNumOperands();
47   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
48     --N;
49   return N;
50 }
51 
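// Return the chain operand of the load node \p Load, which is expected to be
// its last non-glue operand.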
52 static SDValue findChainOperand(SDNode *Load) {
53   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
54   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
55   return LastOp;
56 }
57 
/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if neither node has this operand.
60 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
61   unsigned Opc0 = N0->getMachineOpcode();
62   unsigned Opc1 = N1->getMachineOpcode();
63 
64   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
65   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
66 
67   if (Op0Idx == -1 && Op1Idx == -1)
68     return true;
69 
70 
71   if ((Op0Idx == -1 && Op1Idx != -1) ||
72       (Op1Idx == -1 && Op0Idx != -1))
73     return false;
74 
75   // getNamedOperandIdx returns the index for the MachineInstr's operands,
76   // which includes the result as the first operand. We are indexing into the
77   // MachineSDNode's operands, so we need to skip the result operand to get
78   // the real index.
79   --Op0Idx;
80   --Op1Idx;
81 
82   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
83 }
84 
85 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
86                                                     AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable because of their implicit reads of exec. We really want
  // all of the generic logic here except for the exec check.
90   switch (MI.getOpcode()) {
91   case AMDGPU::V_MOV_B32_e32:
92   case AMDGPU::V_MOV_B32_e64:
93   case AMDGPU::V_MOV_B64_PSEUDO:
94     return true;
95   default:
96     return false;
97   }
98 }
99 
100 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
101                                           int64_t &Offset0,
102                                           int64_t &Offset1) const {
103   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
104     return false;
105 
106   unsigned Opc0 = Load0->getMachineOpcode();
107   unsigned Opc1 = Load1->getMachineOpcode();
108 
109   // Make sure both are actually loads.
110   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
111     return false;
112 
113   if (isDS(Opc0) && isDS(Opc1)) {
114 
115     // FIXME: Handle this case:
116     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
117       return false;
118 
119     // Check base reg.
120     if (Load0->getOperand(1) != Load1->getOperand(1))
121       return false;
122 
123     // Check chain.
124     if (findChainOperand(Load0) != findChainOperand(Load1))
125       return false;
126 
127     // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
130     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
131         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
132       return false;
133 
134     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
135     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
136     return true;
137   }
138 
139   if (isSMRD(Opc0) && isSMRD(Opc1)) {
140     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
141 
142     // Check base reg.
143     if (Load0->getOperand(0) != Load1->getOperand(0))
144       return false;
145 
146     const ConstantSDNode *Load0Offset =
147         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
148     const ConstantSDNode *Load1Offset =
149         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
150 
151     if (!Load0Offset || !Load1Offset)
152       return false;
153 
154     // Check chain.
155     if (findChainOperand(Load0) != findChainOperand(Load1))
156       return false;
157 
158     Offset0 = Load0Offset->getZExtValue();
159     Offset1 = Load1Offset->getZExtValue();
160     return true;
161   }
162 
163   // MUBUF and MTBUF can access the same addresses.
164   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
165 
166     // MUBUF and MTBUF have vaddr at different indices.
167     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
168         findChainOperand(Load0) != findChainOperand(Load1) ||
169         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
170         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
171       return false;
172 
173     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
174     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
175 
176     if (OffIdx0 == -1 || OffIdx1 == -1)
177       return false;
178 
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // decrement the index by one.
182     --OffIdx0;
183     --OffIdx1;
184 
185     SDValue Off0 = Load0->getOperand(OffIdx0);
186     SDValue Off1 = Load1->getOperand(OffIdx1);
187 
188     // The offset might be a FrameIndexSDNode.
189     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
190       return false;
191 
192     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
193     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
194     return true;
195   }
196 
197   return false;
198 }
199 
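// Return true for the DS read2/write2 st64 variants, whose two offsets are in
// units of 64 elements rather than single elements.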
200 static bool isStride64(unsigned Opc) {
201   switch (Opc) {
202   case AMDGPU::DS_READ2ST64_B32:
203   case AMDGPU::DS_READ2ST64_B64:
204   case AMDGPU::DS_WRITE2ST64_B32:
205   case AMDGPU::DS_WRITE2ST64_B64:
206     return true;
207   default:
208     return false;
209   }
210 }
211 
212 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
213                                         int64_t &Offset,
214                                         const TargetRegisterInfo *TRI) const {
215   unsigned Opc = LdSt.getOpcode();
216 
217   if (isDS(LdSt)) {
218     const MachineOperand *OffsetImm =
219         getNamedOperand(LdSt, AMDGPU::OpName::offset);
220     if (OffsetImm) {
221       // Normal, single offset LDS instruction.
222       const MachineOperand *AddrReg =
223           getNamedOperand(LdSt, AMDGPU::OpName::addr);
224 
225       BaseReg = AddrReg->getReg();
226       Offset = OffsetImm->getImm();
227       return true;
228     }
229 
230     // The 2 offset instructions use offset0 and offset1 instead. We can treat
231     // these as a load with a single offset if the 2 offsets are consecutive. We
232     // will use this for some partially aligned loads.
233     const MachineOperand *Offset0Imm =
234         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
235     const MachineOperand *Offset1Imm =
236         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
237 
238     uint8_t Offset0 = Offset0Imm->getImm();
239     uint8_t Offset1 = Offset1Imm->getImm();
240 
241     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element-sized units, so we need to convert
      // them to bytes for the individual reads.
244 
245       unsigned EltSize;
246       if (LdSt.mayLoad())
247         EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
248       else {
249         assert(LdSt.mayStore());
250         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
251         EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
252       }
253 
254       if (isStride64(Opc))
255         EltSize *= 64;
256 
257       const MachineOperand *AddrReg =
258           getNamedOperand(LdSt, AMDGPU::OpName::addr);
259       BaseReg = AddrReg->getReg();
260       Offset = EltSize * Offset0;
261       return true;
262     }
263 
264     return false;
265   }
266 
267   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
268     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
269     if (SOffset && SOffset->isReg())
270       return false;
271 
272     const MachineOperand *AddrReg =
273         getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
274     if (!AddrReg)
275       return false;
276 
277     const MachineOperand *OffsetImm =
278         getNamedOperand(LdSt, AMDGPU::OpName::offset);
279     BaseReg = AddrReg->getReg();
280     Offset = OffsetImm->getImm();
281 
282     if (SOffset) // soffset can be an inline immediate.
283       Offset += SOffset->getImm();
284 
285     return true;
286   }
287 
288   if (isSMRD(LdSt)) {
289     const MachineOperand *OffsetImm =
290         getNamedOperand(LdSt, AMDGPU::OpName::offset);
291     if (!OffsetImm)
292       return false;
293 
294     const MachineOperand *SBaseReg =
295         getNamedOperand(LdSt, AMDGPU::OpName::sbase);
296     BaseReg = SBaseReg->getReg();
297     Offset = OffsetImm->getImm();
298     return true;
299   }
300 
301   if (isFLAT(LdSt)) {
302     const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
303     BaseReg = AddrReg->getReg();
304     Offset = 0;
305     return true;
306   }
307 
308   return false;
309 }
310 
311 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
312                                       MachineInstr &SecondLdSt,
313                                       unsigned NumLoads) const {
314   const MachineOperand *FirstDst = nullptr;
315   const MachineOperand *SecondDst = nullptr;
316 
317   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
318       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
319     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
320     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
321   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
322     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
323     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
324   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
325     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
326     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
327   }
328 
329   if (!FirstDst || !SecondDst)
330     return false;
331 
332   // Try to limit clustering based on the total number of bytes loaded
333   // rather than the number of instructions.  This is done to help reduce
334   // register pressure.  The method used is somewhat inexact, though,
335   // because it assumes that all loads in the cluster will load the
336   // same number of bytes as FirstLdSt.
337 
338   // The unit of this value is bytes.
339   // FIXME: This needs finer tuning.
340   unsigned LoadClusterThreshold = 16;
341 
342   const MachineRegisterInfo &MRI =
343       FirstLdSt.getParent()->getParent()->getRegInfo();
344   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
345 
346   return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
347 }
348 
349 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
350                               MachineBasicBlock::iterator MI,
351                               const DebugLoc &DL, unsigned DestReg,
352                               unsigned SrcReg, bool KillSrc) const {
353   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
354 
355   if (RC == &AMDGPU::VGPR_32RegClass) {
356     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
357            AMDGPU::SReg_32RegClass.contains(SrcReg));
358     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
359       .addReg(SrcReg, getKillRegState(KillSrc));
360     return;
361   }
362 
363   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
364       RC == &AMDGPU::SReg_32RegClass) {
365     if (SrcReg == AMDGPU::SCC) {
366       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
367           .addImm(-1)
368           .addImm(0);
369       return;
370     }
371 
372     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
373     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
374             .addReg(SrcReg, getKillRegState(KillSrc));
375     return;
376   }
377 
378   if (RC == &AMDGPU::SReg_64RegClass) {
379     if (DestReg == AMDGPU::VCC) {
380       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
381         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
382           .addReg(SrcReg, getKillRegState(KillSrc));
383       } else {
384         // FIXME: Hack until VReg_1 removed.
385         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
386         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
387           .addImm(0)
388           .addReg(SrcReg, getKillRegState(KillSrc));
389       }
390 
391       return;
392     }
393 
394     assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
395     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
396             .addReg(SrcReg, getKillRegState(KillSrc));
397     return;
398   }
399 
400   if (DestReg == AMDGPU::SCC) {
401     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
402     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
403       .addReg(SrcReg, getKillRegState(KillSrc))
404       .addImm(0);
405     return;
406   }
407 
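  // For wider register classes, copy the register one 32-bit piece (or, for
  // SGPRs, one 64-bit piece) at a time.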
408   unsigned EltSize = 4;
409   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
410   if (RI.isSGPRClass(RC)) {
411     if (RC->getSize() > 4) {
412       Opcode =  AMDGPU::S_MOV_B64;
413       EltSize = 8;
414     } else {
415       Opcode = AMDGPU::S_MOV_B32;
416       EltSize = 4;
417     }
418   }
419 
420   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
421   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
422 
423   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
424     unsigned SubIdx;
425     if (Forward)
426       SubIdx = SubIndices[Idx];
427     else
428       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
429 
430     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
431       get(Opcode), RI.getSubReg(DestReg, SubIdx));
432 
433     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
434 
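    // Add implicit operands for the full source and destination registers so
    // the liveness of the wide registers stays correct across the
    // per-subregister copies.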
435     if (Idx == SubIndices.size() - 1)
436       Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
437 
438     if (Idx == 0)
439       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
440 
441     Builder.addReg(SrcReg, RegState::Implicit);
442   }
443 }
444 
445 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
446   int NewOpc;
447 
448   // Try to map original to commuted opcode
449   NewOpc = AMDGPU::getCommuteRev(Opcode);
450   if (NewOpc != -1)
451     // Check if the commuted (REV) opcode exists on the target.
452     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
453 
454   // Try to map commuted to original opcode
455   NewOpc = AMDGPU::getCommuteOrig(Opcode);
456   if (NewOpc != -1)
457     // Check if the original (non-REV) opcode exists on the target.
458     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
459 
460   return Opcode;
461 }
462 
463 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
464 
465   if (DstRC->getSize() == 4) {
466     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
467   } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
468     return AMDGPU::S_MOV_B64;
469   } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
470     return  AMDGPU::V_MOV_B64_PSEUDO;
471   }
472   return AMDGPU::COPY;
473 }
474 
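// Map a spill size in bytes to the matching SGPR spill pseudo instruction.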
475 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
476   switch (Size) {
477   case 4:
478     return AMDGPU::SI_SPILL_S32_SAVE;
479   case 8:
480     return AMDGPU::SI_SPILL_S64_SAVE;
481   case 16:
482     return AMDGPU::SI_SPILL_S128_SAVE;
483   case 32:
484     return AMDGPU::SI_SPILL_S256_SAVE;
485   case 64:
486     return AMDGPU::SI_SPILL_S512_SAVE;
487   default:
488     llvm_unreachable("unknown register size");
489   }
490 }
491 
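// Map a spill size in bytes to the matching VGPR spill pseudo instruction.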
492 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
493   switch (Size) {
494   case 4:
495     return AMDGPU::SI_SPILL_V32_SAVE;
496   case 8:
497     return AMDGPU::SI_SPILL_V64_SAVE;
498   case 12:
499     return AMDGPU::SI_SPILL_V96_SAVE;
500   case 16:
501     return AMDGPU::SI_SPILL_V128_SAVE;
502   case 32:
503     return AMDGPU::SI_SPILL_V256_SAVE;
504   case 64:
505     return AMDGPU::SI_SPILL_V512_SAVE;
506   default:
507     llvm_unreachable("unknown register size");
508   }
509 }
510 
511 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
512                                       MachineBasicBlock::iterator MI,
513                                       unsigned SrcReg, bool isKill,
514                                       int FrameIndex,
515                                       const TargetRegisterClass *RC,
516                                       const TargetRegisterInfo *TRI) const {
517   MachineFunction *MF = MBB.getParent();
518   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
519   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
520   DebugLoc DL = MBB.findDebugLoc(MI);
521 
522   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
523   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
524   MachinePointerInfo PtrInfo
525     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
526   MachineMemOperand *MMO
527     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
528                                Size, Align);
529 
530   if (RI.isSGPRClass(RC)) {
531     MFI->setHasSpilledSGPRs();
532 
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling SGPRs.
535     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(RC->getSize()));
536 
    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
539     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
540       MachineRegisterInfo &MRI = MF->getRegInfo();
541       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
542     }
543 
544     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
545       .addReg(SrcReg, getKillRegState(isKill)) // data
546       .addFrameIndex(FrameIndex)               // addr
547       .addMemOperand(MMO)
548       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
549       .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
550     // Add the scratch resource registers as implicit uses because we may end up
551     // needing them, and need to ensure that the reserved registers are
552     // correctly handled.
553 
554     if (ST.hasScalarStores()) {
      // m0 is used as the offset register for scalar stores when spilling.
556       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
557     }
558 
559     return;
560   }
561 
562   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
563     LLVMContext &Ctx = MF->getFunction()->getContext();
564     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
565                   " spill register");
566     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
567       .addReg(SrcReg);
568 
569     return;
570   }
571 
572   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
573 
574   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
575   MFI->setHasSpilledVGPRs();
576   BuildMI(MBB, MI, DL, get(Opcode))
577     .addReg(SrcReg, getKillRegState(isKill)) // data
578     .addFrameIndex(FrameIndex)               // addr
579     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
580     .addReg(MFI->getScratchWaveOffsetReg())  // scratch_offset
581     .addImm(0)                               // offset
582     .addMemOperand(MMO);
583 }
584 
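// Map a spill size in bytes to the matching SGPR reload pseudo instruction.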
585 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
586   switch (Size) {
587   case 4:
588     return AMDGPU::SI_SPILL_S32_RESTORE;
589   case 8:
590     return AMDGPU::SI_SPILL_S64_RESTORE;
591   case 16:
592     return AMDGPU::SI_SPILL_S128_RESTORE;
593   case 32:
594     return AMDGPU::SI_SPILL_S256_RESTORE;
595   case 64:
596     return AMDGPU::SI_SPILL_S512_RESTORE;
597   default:
598     llvm_unreachable("unknown register size");
599   }
600 }
601 
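// Map a spill size in bytes to the matching VGPR reload pseudo instruction.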
602 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
603   switch (Size) {
604   case 4:
605     return AMDGPU::SI_SPILL_V32_RESTORE;
606   case 8:
607     return AMDGPU::SI_SPILL_V64_RESTORE;
608   case 12:
609     return AMDGPU::SI_SPILL_V96_RESTORE;
610   case 16:
611     return AMDGPU::SI_SPILL_V128_RESTORE;
612   case 32:
613     return AMDGPU::SI_SPILL_V256_RESTORE;
614   case 64:
615     return AMDGPU::SI_SPILL_V512_RESTORE;
616   default:
617     llvm_unreachable("unknown register size");
618   }
619 }
620 
621 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
622                                        MachineBasicBlock::iterator MI,
623                                        unsigned DestReg, int FrameIndex,
624                                        const TargetRegisterClass *RC,
625                                        const TargetRegisterInfo *TRI) const {
626   MachineFunction *MF = MBB.getParent();
627   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
628   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
629   DebugLoc DL = MBB.findDebugLoc(MI);
630   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
631   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
632 
633   MachinePointerInfo PtrInfo
634     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
635 
636   MachineMemOperand *MMO = MF->getMachineMemOperand(
637     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
638 
639   if (RI.isSGPRClass(RC)) {
640     // FIXME: Maybe this should not include a memoperand because it will be
641     // lowered to non-memory instructions.
642     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(RC->getSize()));
643     if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
644       MachineRegisterInfo &MRI = MF->getRegInfo();
645       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
646     }
647 
648     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
649       .addFrameIndex(FrameIndex) // addr
650       .addMemOperand(MMO)
651       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
652       .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
653 
654     if (ST.hasScalarStores()) {
      // m0 is used as the offset register for scalar stores when spilling.
656       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine);
657     }
658 
659     return;
660   }
661 
662   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
663     LLVMContext &Ctx = MF->getFunction()->getContext();
664     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
665                   " restore register");
666     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
667 
668     return;
669   }
670 
671   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
672 
673   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
674   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
675     .addFrameIndex(FrameIndex)              // vaddr
676     .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
677     .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
678     .addImm(0)                              // offset
679     .addMemOperand(MMO);
680 }
681 
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
683 unsigned SIInstrInfo::calculateLDSSpillAddress(
684     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
685     unsigned FrameOffset, unsigned Size) const {
686   MachineFunction *MF = MBB.getParent();
687   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
688   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
689   const SIRegisterInfo *TRI = ST.getRegisterInfo();
690   DebugLoc DL = MBB.findDebugLoc(MI);
691   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
692   unsigned WavefrontSize = ST.getWavefrontSize();
693 
694   unsigned TIDReg = MFI->getTIDReg();
695   if (!MFI->hasCalculatedTID()) {
696     MachineBasicBlock &Entry = MBB.getParent()->front();
697     MachineBasicBlock::iterator Insert = Entry.front();
698     DebugLoc DL = Insert->getDebugLoc();
699 
700     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
701                                    *MF);
702     if (TIDReg == AMDGPU::NoRegister)
703       return TIDReg;
704 
705     if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
706         WorkGroupSize > WavefrontSize) {
707 
708       unsigned TIDIGXReg
709         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
710       unsigned TIDIGYReg
711         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
712       unsigned TIDIGZReg
713         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
714       unsigned InputPtrReg =
715           TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
716       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
717         if (!Entry.isLiveIn(Reg))
718           Entry.addLiveIn(Reg);
719       }
720 
721       RS->enterBasicBlock(Entry);
722       // FIXME: Can we scavenge an SReg_64 and access the subregs?
723       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
724       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
725       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
726               .addReg(InputPtrReg)
727               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
728       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
729               .addReg(InputPtrReg)
730               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
731 
732       // NGROUPS.X * NGROUPS.Y
733       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
734               .addReg(STmp1)
735               .addReg(STmp0);
736       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
737       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
738               .addReg(STmp1)
739               .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
741       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
742               .addReg(STmp0)
743               .addReg(TIDIGYReg)
744               .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
746       BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
747               .addReg(TIDReg)
748               .addReg(TIDIGZReg);
749     } else {
750       // Get the wave id
751       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
752               TIDReg)
753               .addImm(-1)
754               .addImm(0);
755 
756       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
757               TIDReg)
758               .addImm(-1)
759               .addReg(TIDReg);
760     }
761 
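    // Convert the thread index into a byte offset (4 bytes per lane).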
762     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
763             TIDReg)
764             .addImm(2)
765             .addReg(TIDReg);
766     MFI->setTIDReg(TIDReg);
767   }
768 
769   // Add FrameIndex to LDS offset
770   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
771   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
772           .addImm(LDSOffset)
773           .addReg(TIDReg);
774 
775   return TmpReg;
776 }
777 
778 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
779                                    MachineBasicBlock::iterator MI,
780                                    int Count) const {
781   DebugLoc DL = MBB.findDebugLoc(MI);
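  // S_NOP's immediate encodes (wait states - 1), so each S_NOP covers at most
  // 8 wait states; emit as many as needed.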
782   while (Count > 0) {
783     int Arg;
784     if (Count >= 8)
785       Arg = 7;
786     else
787       Arg = Count - 1;
788     Count -= 8;
789     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
790             .addImm(Arg);
791   }
792 }
793 
794 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
795                              MachineBasicBlock::iterator MI) const {
796   insertWaitStates(MBB, MI, 1);
797 }
798 
799 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
800   switch (MI.getOpcode()) {
801   default: return 1; // FIXME: Do wait states equal cycles?
802 
803   case AMDGPU::S_NOP:
804     return MI.getOperand(0).getImm() + 1;
805   }
806 }
807 
808 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
809   MachineBasicBlock &MBB = *MI.getParent();
810   DebugLoc DL = MBB.findDebugLoc(MI);
811   switch (MI.getOpcode()) {
812   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
813   case AMDGPU::S_MOV_B64_term: {
814     // This is only a terminator to get the correct spill code placement during
815     // register allocation.
816     MI.setDesc(get(AMDGPU::S_MOV_B64));
817     break;
818   }
819   case AMDGPU::S_XOR_B64_term: {
820     // This is only a terminator to get the correct spill code placement during
821     // register allocation.
822     MI.setDesc(get(AMDGPU::S_XOR_B64));
823     break;
824   }
825   case AMDGPU::S_ANDN2_B64_term: {
826     // This is only a terminator to get the correct spill code placement during
827     // register allocation.
828     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
829     break;
830   }
831   case AMDGPU::V_MOV_B64_PSEUDO: {
832     unsigned Dst = MI.getOperand(0).getReg();
833     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
834     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
835 
836     const MachineOperand &SrcOp = MI.getOperand(1);
837     // FIXME: Will this work for 64-bit floating point immediates?
838     assert(!SrcOp.isFPImm());
839     if (SrcOp.isImm()) {
840       APInt Imm(64, SrcOp.getImm());
841       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
842         .addImm(Imm.getLoBits(32).getZExtValue())
843         .addReg(Dst, RegState::Implicit | RegState::Define);
844       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
845         .addImm(Imm.getHiBits(32).getZExtValue())
846         .addReg(Dst, RegState::Implicit | RegState::Define);
847     } else {
848       assert(SrcOp.isReg());
849       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
850         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
851         .addReg(Dst, RegState::Implicit | RegState::Define);
852       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
853         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
854         .addReg(Dst, RegState::Implicit | RegState::Define);
855     }
856     MI.eraseFromParent();
857     break;
858   }
859   case AMDGPU::V_MOVRELD_B32_V1:
860   case AMDGPU::V_MOVRELD_B32_V2:
861   case AMDGPU::V_MOVRELD_B32_V4:
862   case AMDGPU::V_MOVRELD_B32_V8:
863   case AMDGPU::V_MOVRELD_B32_V16: {
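    // Expand to a real V_MOVRELD_B32 writing the selected sub-register. The
    // full vector register is added as a tied implicit def and use, since the
    // instruction only writes part of the vector.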
864     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
865     unsigned VecReg = MI.getOperand(0).getReg();
866     bool IsUndef = MI.getOperand(1).isUndef();
867     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
868     assert(VecReg == MI.getOperand(1).getReg());
869 
870     MachineInstr *MovRel =
871         BuildMI(MBB, MI, DL, MovRelDesc)
872             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
873             .add(MI.getOperand(2))
874             .addReg(VecReg, RegState::ImplicitDefine)
875             .addReg(VecReg,
876                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
877 
878     const int ImpDefIdx =
879         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
880     const int ImpUseIdx = ImpDefIdx + 1;
881     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
882 
883     MI.eraseFromParent();
884     break;
885   }
886   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
887     MachineFunction &MF = *MBB.getParent();
888     unsigned Reg = MI.getOperand(0).getReg();
889     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
890     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
891 
892     // Create a bundle so these instructions won't be re-ordered by the
893     // post-RA scheduler.
894     MIBundleBuilder Bundler(MBB, MI);
895     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
896 
897     // Add 32-bit offset from this instruction to the start of the
898     // constant data.
899     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
900                        .addReg(RegLo)
901                        .add(MI.getOperand(1)));
902 
903     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
904                                   .addReg(RegHi);
905     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
906       MIB.addImm(0);
907     else
908       MIB.add(MI.getOperand(2));
909 
910     Bundler.append(MIB);
911     llvm::finalizeBundle(MBB, Bundler.begin());
912 
913     MI.eraseFromParent();
914     break;
915   }
916   }
917   return true;
918 }
919 
920 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
921                                       MachineOperand &Src0,
922                                       unsigned Src0OpName,
923                                       MachineOperand &Src1,
924                                       unsigned Src1OpName) const {
925   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
926   if (!Src0Mods)
927     return false;
928 
929   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
930   assert(Src1Mods &&
931          "All commutable instructions have both src0 and src1 modifiers");
932 
933   int Src0ModsVal = Src0Mods->getImm();
934   int Src1ModsVal = Src1Mods->getImm();
935 
936   Src1Mods->setImm(Src0ModsVal);
937   Src0Mods->setImm(Src1ModsVal);
938   return true;
939 }
940 
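// Swap a register operand with an immediate or frame index operand in place,
// preserving the register flags. Returns nullptr if the non-register operand
// is of an unhandled kind.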
941 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
942                                              MachineOperand &RegOp,
943                                              MachineOperand &NonRegOp) {
944   unsigned Reg = RegOp.getReg();
945   unsigned SubReg = RegOp.getSubReg();
946   bool IsKill = RegOp.isKill();
947   bool IsDead = RegOp.isDead();
948   bool IsUndef = RegOp.isUndef();
949   bool IsDebug = RegOp.isDebug();
950 
951   if (NonRegOp.isImm())
952     RegOp.ChangeToImmediate(NonRegOp.getImm());
953   else if (NonRegOp.isFI())
954     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
955   else
956     return nullptr;
957 
958   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
959   NonRegOp.setSubReg(SubReg);
960 
961   return &MI;
962 }
963 
964 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
965                                                   unsigned Src0Idx,
966                                                   unsigned Src1Idx) const {
967   assert(!NewMI && "this should never be used");
968 
969   unsigned Opc = MI.getOpcode();
970   int CommutedOpcode = commuteOpcode(Opc);
971   if (CommutedOpcode == -1)
972     return nullptr;
973 
974   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
975            static_cast<int>(Src0Idx) &&
976          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
977            static_cast<int>(Src1Idx) &&
978          "inconsistency with findCommutedOpIndices");
979 
980   MachineOperand &Src0 = MI.getOperand(Src0Idx);
981   MachineOperand &Src1 = MI.getOperand(Src1Idx);
982 
983   MachineInstr *CommutedMI = nullptr;
984   if (Src0.isReg() && Src1.isReg()) {
985     if (isOperandLegal(MI, Src1Idx, &Src0)) {
986       // Be sure to copy the source modifiers to the right place.
987       CommutedMI
988         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
989     }
990 
991   } else if (Src0.isReg() && !Src1.isReg()) {
992     // src0 should always be able to support any operand type, so no need to
993     // check operand legality.
994     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
995   } else if (!Src0.isReg() && Src1.isReg()) {
996     if (isOperandLegal(MI, Src1Idx, &Src0))
997       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
998   } else {
    // FIXME: Found two non-register operands to commute. This does happen.
1000     return nullptr;
1001   }
1002 
1003 
1004   if (CommutedMI) {
1005     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1006                         Src1, AMDGPU::OpName::src1_modifiers);
1007 
1008     CommutedMI->setDesc(get(CommutedOpcode));
1009   }
1010 
1011   return CommutedMI;
1012 }
1013 
// This needs to be implemented because the source modifier operands may be
// inserted between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction expects these indices.
1017 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1018                                         unsigned &SrcOpIdx1) const {
1019   if (!MI.isCommutable())
1020     return false;
1021 
1022   unsigned Opc = MI.getOpcode();
1023   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1024   if (Src0Idx == -1)
1025     return false;
1026 
1027   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1028   if (Src1Idx == -1)
1029     return false;
1030 
1031   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1032 }
1033 
1034 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1035                                         int64_t BrOffset) const {
1036   // BranchRelaxation should never have to check s_setpc_b64 because its dest
1037   // block is unanalyzable.
1038   assert(BranchOp != AMDGPU::S_SETPC_B64);
1039 
1040   // Convert to dwords.
1041   BrOffset /= 4;
1042 
1043   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1044   // from the next instruction.
1045   BrOffset -= 1;
1046 
1047   return isIntN(BranchOffsetBits, BrOffset);
1048 }
1049 
1050 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1051   const MachineInstr &MI) const {
1052   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // This would be a difficult analysis to perform, but it can always be
    // legal, so there's no need to analyze it.
1055     return nullptr;
1056   }
1057 
1058   return MI.getOperand(0).getMBB();
1059 }
1060 
1061 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1062                                            MachineBasicBlock &DestBB,
1063                                            const DebugLoc &DL,
1064                                            int64_t BrOffset,
1065                                            RegScavenger *RS) const {
1066   assert(RS && "RegScavenger required for long branching");
1067   assert(MBB.empty() &&
1068          "new block should be inserted for expanding unconditional branch");
1069   assert(MBB.pred_size() == 1);
1070 
1071   MachineFunction *MF = MBB.getParent();
1072   MachineRegisterInfo &MRI = MF->getRegInfo();
1073 
1074   // FIXME: Virtual register workaround for RegScavenger not working with empty
1075   // blocks.
1076   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1077 
1078   auto I = MBB.end();
1079 
1080   // We need to compute the offset relative to the instruction immediately after
1081   // s_getpc_b64. Insert pc arithmetic code before last terminator.
1082   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1083 
1084   // TODO: Handle > 32-bit block address.
1085   if (BrOffset >= 0) {
1086     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1087       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1088       .addReg(PCReg, 0, AMDGPU::sub0)
1089       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1090     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1091       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1092       .addReg(PCReg, 0, AMDGPU::sub1)
1093       .addImm(0);
1094   } else {
1095     // Backwards branch.
1096     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1097       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1098       .addReg(PCReg, 0, AMDGPU::sub0)
1099       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1100     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1101       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1102       .addReg(PCReg, 0, AMDGPU::sub1)
1103       .addImm(0);
1104   }
1105 
1106   // Insert the indirect branch after the other terminator.
1107   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1108     .addReg(PCReg);
1109 
1110   // FIXME: If spilling is necessary, this will fail because this scavenger has
1111   // no emergency stack slots. It is non-trivial to spill in this situation,
1112   // because the restore code needs to be specially placed after the
1113   // jump. BranchRelaxation then needs to be made aware of the newly inserted
1114   // block.
1115   //
1116   // If a spill is needed for the pc register pair, we need to insert a spill
1117   // restore block right before the destination block, and insert a short branch
1118   // into the old destination block's fallthrough predecessor.
1119   // e.g.:
1120   //
1121   // s_cbranch_scc0 skip_long_branch:
1122   //
1123   // long_branch_bb:
1124   //   spill s[8:9]
1125   //   s_getpc_b64 s[8:9]
1126   //   s_add_u32 s8, s8, restore_bb
1127   //   s_addc_u32 s9, s9, 0
1128   //   s_setpc_b64 s[8:9]
1129   //
1130   // skip_long_branch:
1131   //   foo;
1132   //
1133   // .....
1134   //
1135   // dest_bb_fallthrough_predecessor:
1136   // bar;
1137   // s_branch dest_bb
1138   //
1139   // restore_bb:
1140   //  restore s[8:9]
1141   //  fallthrough dest_bb
  //
1143   // dest_bb:
1144   //   buzz;
1145 
1146   RS->enterBasicBlockEnd(MBB);
1147   unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1148                                        MachineBasicBlock::iterator(GetPC), 0);
1149   MRI.replaceRegWith(PCReg, Scav);
1150   MRI.clearVirtRegs();
1151   RS->setRegUsed(Scav);
1152 
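  // Return the size in bytes of the emitted sequence: s_getpc_b64, the 64-bit
  // add of the offset (whose low half takes a 32-bit literal), and
  // s_setpc_b64.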
1153   return 4 + 8 + 4 + 4;
1154 }
1155 
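// Map a BranchPredicate to the corresponding s_cbranch opcode.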
1156 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1157   switch (Cond) {
1158   case SIInstrInfo::SCC_TRUE:
1159     return AMDGPU::S_CBRANCH_SCC1;
1160   case SIInstrInfo::SCC_FALSE:
1161     return AMDGPU::S_CBRANCH_SCC0;
1162   case SIInstrInfo::VCCNZ:
1163     return AMDGPU::S_CBRANCH_VCCNZ;
1164   case SIInstrInfo::VCCZ:
1165     return AMDGPU::S_CBRANCH_VCCZ;
1166   case SIInstrInfo::EXECNZ:
1167     return AMDGPU::S_CBRANCH_EXECNZ;
1168   case SIInstrInfo::EXECZ:
1169     return AMDGPU::S_CBRANCH_EXECZ;
1170   default:
1171     llvm_unreachable("invalid branch predicate");
1172   }
1173 }
1174 
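// Map a conditional branch opcode back to its BranchPredicate, or INVALID_BR
// if it is not a recognized conditional branch.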
1175 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1176   switch (Opcode) {
1177   case AMDGPU::S_CBRANCH_SCC0:
1178     return SCC_FALSE;
1179   case AMDGPU::S_CBRANCH_SCC1:
1180     return SCC_TRUE;
1181   case AMDGPU::S_CBRANCH_VCCNZ:
1182     return VCCNZ;
1183   case AMDGPU::S_CBRANCH_VCCZ:
1184     return VCCZ;
1185   case AMDGPU::S_CBRANCH_EXECNZ:
1186     return EXECNZ;
1187   case AMDGPU::S_CBRANCH_EXECZ:
1188     return EXECZ;
1189   default:
1190     return INVALID_BR;
1191   }
1192 }
1193 
1194 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1195                                     MachineBasicBlock::iterator I,
1196                                     MachineBasicBlock *&TBB,
1197                                     MachineBasicBlock *&FBB,
1198                                     SmallVectorImpl<MachineOperand> &Cond,
1199                                     bool AllowModify) const {
1200   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1201     // Unconditional Branch
1202     TBB = I->getOperand(0).getMBB();
1203     return false;
1204   }
1205 
1206   BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1207   if (Pred == INVALID_BR)
1208     return true;
1209 
1210   MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
1211   Cond.push_back(MachineOperand::CreateImm(Pred));
1212   Cond.push_back(I->getOperand(1)); // Save the branch register.
1213 
1214   ++I;
1215 
1216   if (I == MBB.end()) {
1217     // Conditional branch followed by fall-through.
1218     TBB = CondBB;
1219     return false;
1220   }
1221 
1222   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1223     TBB = CondBB;
1224     FBB = I->getOperand(0).getMBB();
1225     return false;
1226   }
1227 
1228   return true;
1229 }
1230 
1231 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1232                                 MachineBasicBlock *&FBB,
1233                                 SmallVectorImpl<MachineOperand> &Cond,
1234                                 bool AllowModify) const {
1235   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1236   if (I == MBB.end())
1237     return false;
1238 
1239   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1240     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1241 
1242   ++I;
1243 
1244   // TODO: Should be able to treat as fallthrough?
1245   if (I == MBB.end())
1246     return true;
1247 
1248   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1249     return true;
1250 
1251   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1252 
1253   // Specifically handle the case where the conditional branch is to the same
1254   // destination as the mask branch. e.g.
1255   //
1256   // si_mask_branch BB8
1257   // s_cbranch_execz BB8
1258   // s_cbranch BB9
1259   //
1260   // This is required to understand divergent loops which may need the branches
1261   // to be relaxed.
1262   if (TBB != MaskBrDest || Cond.empty())
1263     return true;
1264 
1265   auto Pred = Cond[0].getImm();
1266   return (Pred != EXECZ && Pred != EXECNZ);
1267 }
1268 
1269 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1270                                    int *BytesRemoved) const {
1271   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1272 
1273   unsigned Count = 0;
1274   unsigned RemovedSize = 0;
1275   while (I != MBB.end()) {
1276     MachineBasicBlock::iterator Next = std::next(I);
1277     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1278       I = Next;
1279       continue;
1280     }
1281 
1282     RemovedSize += getInstSizeInBytes(*I);
1283     I->eraseFromParent();
1284     ++Count;
1285     I = Next;
1286   }
1287 
1288   if (BytesRemoved)
1289     *BytesRemoved = RemovedSize;
1290 
1291   return Count;
1292 }
1293 
1294 // Copy the flags onto the implicit condition register operand.
1295 static void preserveCondRegFlags(MachineOperand &CondReg,
1296                                  const MachineOperand &OrigCond) {
1297   CondReg.setIsUndef(OrigCond.isUndef());
1298   CondReg.setIsKill(OrigCond.isKill());
1299 }
1300 
1301 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1302                                    MachineBasicBlock *TBB,
1303                                    MachineBasicBlock *FBB,
1304                                    ArrayRef<MachineOperand> Cond,
1305                                    const DebugLoc &DL,
1306                                    int *BytesAdded) const {
1307 
1308   if (!FBB && Cond.empty()) {
1309     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1310       .addMBB(TBB);
1311     if (BytesAdded)
1312       *BytesAdded = 4;
1313     return 1;
1314   }
1315 
1316   assert(TBB && Cond[0].isImm());
1317 
1318   unsigned Opcode
1319     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1320 
1321   if (!FBB) {
1323     MachineInstr *CondBr =
1324       BuildMI(&MBB, DL, get(Opcode))
1325       .addMBB(TBB);
1326 
1327     // Copy the flags onto the implicit condition register operand.
1328     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1329 
1330     if (BytesAdded)
1331       *BytesAdded = 4;
1332     return 1;
1333   }
1334 
1335   assert(TBB && FBB);
1336 
1337   MachineInstr *CondBr =
1338     BuildMI(&MBB, DL, get(Opcode))
1339     .addMBB(TBB);
1340   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1341     .addMBB(FBB);
1342 
  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1346 
1347   if (BytesAdded)
    *BytesAdded = 8;
1349 
1350   return 2;
1351 }
1352 
1353 bool SIInstrInfo::reverseBranchCondition(
1354   SmallVectorImpl<MachineOperand> &Cond) const {
1355   assert(Cond.size() == 2);
1356   Cond[0].setImm(-Cond[0].getImm());
1357   return false;
1358 }
1359 
1360 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1361                                   ArrayRef<MachineOperand> Cond,
1362                                   unsigned TrueReg, unsigned FalseReg,
1363                                   int &CondCycles,
1364                                   int &TrueCycles, int &FalseCycles) const {
1365   switch (Cond[0].getImm()) {
1366   case VCCNZ:
1367   case VCCZ: {
1368     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1369     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1370     assert(MRI.getRegClass(FalseReg) == RC);
1371 
1372     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1373     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1374 
1375     // Limit to equal cost for branch vs. N v_cndmask_b32s.
1376     return !RI.isSGPRClass(RC) && NumInsts <= 6;
1377   }
1378   case SCC_TRUE:
1379   case SCC_FALSE: {
1380     // FIXME: We could insert for VGPRs if we could replace the original compare
1381     // with a vector one.
1382     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1383     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1384     assert(MRI.getRegClass(FalseReg) == RC);
1385 
1386     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1387 
    // Sizes that are a multiple of 64 bits can use s_cselect_b64, halving the
    // instruction count.
1389     if (NumInsts % 2 == 0)
1390       NumInsts /= 2;
1391 
1392     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1393     return RI.isSGPRClass(RC);
1394   }
1395   default:
1396     return false;
1397   }
1398 }
1399 
1400 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1401                                MachineBasicBlock::iterator I, const DebugLoc &DL,
1402                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
1403                                unsigned TrueReg, unsigned FalseReg) const {
1404   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1405   if (Pred == VCCZ || Pred == SCC_FALSE) {
1406     Pred = static_cast<BranchPredicate>(-Pred);
1407     std::swap(TrueReg, FalseReg);
1408   }
1409 
1410   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1411   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1412   unsigned DstSize = DstRC->getSize();
1413 
1414   if (DstSize == 4) {
1415     unsigned SelOp = Pred == SCC_TRUE ?
1416       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1417 
1418     // Instruction's operands are backwards from what is expected.
1419     MachineInstr *Select =
1420       BuildMI(MBB, I, DL, get(SelOp), DstReg)
1421       .addReg(FalseReg)
1422       .addReg(TrueReg);
1423 
1424     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1425     return;
1426   }
1427 
1428   if (DstSize == 8 && Pred == SCC_TRUE) {
1429     MachineInstr *Select =
1430       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1431       .addReg(FalseReg)
1432       .addReg(TrueReg);
1433 
1434     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1435     return;
1436   }
1437 
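  // Sub-register index sequences used to split wide selects into 32-bit
  // (VALU) or 64-bit (SALU) pieces.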
1438   static const int16_t Sub0_15[] = {
1439     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1440     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1441     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1442     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1443   };
1444 
1445   static const int16_t Sub0_15_64[] = {
1446     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1447     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1448     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1449     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1450   };
1451 
1452   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1453   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1454   const int16_t *SubIndices = Sub0_15;
1455   int NElts = DstSize / 4;
1456 
  // 64-bit select is only available for SALU.
1458   if (Pred == SCC_TRUE) {
1459     SelOp = AMDGPU::S_CSELECT_B64;
1460     EltRC = &AMDGPU::SGPR_64RegClass;
1461     SubIndices = Sub0_15_64;
1462 
1463     assert(NElts % 2 == 0);
1464     NElts /= 2;
1465   }
1466 
1467   MachineInstrBuilder MIB = BuildMI(
1468     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1469 
1470   I = MIB->getIterator();
1471 
1472   SmallVector<unsigned, 8> Regs;
1473   for (int Idx = 0; Idx != NElts; ++Idx) {
1474     unsigned DstElt = MRI.createVirtualRegister(EltRC);
1475     Regs.push_back(DstElt);
1476 
1477     unsigned SubIdx = SubIndices[Idx];
1478 
1479     MachineInstr *Select =
1480       BuildMI(MBB, I, DL, get(SelOp), DstElt)
1481       .addReg(FalseReg, 0, SubIdx)
1482       .addReg(TrueReg, 0, SubIdx);
1483     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1484 
1485     MIB.addReg(DstElt)
1486        .addImm(SubIdx);
1487   }
1488 }
1489 
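// Remove the src0/src1/src2 modifier operands from \p MI, highest index first
// so the remaining indices stay valid. Used when rewriting to VOP2 forms that
// have no source modifiers.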
1490 static void removeModOperands(MachineInstr &MI) {
1491   unsigned Opc = MI.getOpcode();
1492   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1493                                               AMDGPU::OpName::src0_modifiers);
1494   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1495                                               AMDGPU::OpName::src1_modifiers);
1496   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1497                                               AMDGPU::OpName::src2_modifiers);
1498 
1499   MI.RemoveOperand(Src2ModIdx);
1500   MI.RemoveOperand(Src1ModIdx);
1501   MI.RemoveOperand(Src0ModIdx);
1502 }
1503 
1504 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1505                                 unsigned Reg, MachineRegisterInfo *MRI) const {
1506   if (!MRI->hasOneNonDBGUse(Reg))
1507     return false;
1508 
1509   unsigned Opc = UseMI.getOpcode();
1510   if (Opc == AMDGPU::COPY) {
1511     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1512     switch (DefMI.getOpcode()) {
1513     default:
1514       return false;
1515     case AMDGPU::S_MOV_B64:
      // TODO: We could fold 64-bit immediates, but this gets complicated
      // when there are sub-registers.
1518       return false;
1519 
1520     case AMDGPU::V_MOV_B32_e32:
1521     case AMDGPU::S_MOV_B32:
1522       break;
1523     }
1524     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1525     const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1526     assert(ImmOp);
1527     // FIXME: We could handle FrameIndex values here.
1528     if (!ImmOp->isImm()) {
1529       return false;
1530     }
1531     UseMI.setDesc(get(NewOpc));
1532     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1533     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1534     return true;
1535   }
1536 
1537   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1538       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1539     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1540 
1541     // Don't fold if we are using source modifiers. The new VOP2 instructions
1542     // don't have them.
1543     if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
1544         hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
1545         hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
1546       return false;
1547     }
1548 
1549     const MachineOperand &ImmOp = DefMI.getOperand(1);
1550 
1551     // If this is a free constant, there's no reason to do this.
1552     // TODO: We could fold this here instead of letting SIFoldOperands do it
1553     // later.
1554     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1555 
1556     // Any src operand can be used for the legality check.
1557     if (isInlineConstant(UseMI, *Src0, ImmOp))
1558       return false;
1559 
1560     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
1561     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
1562 
1563     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
1564     // We should only expect these to be on src0 due to canonicalizations.
1565     if (Src0->isReg() && Src0->getReg() == Reg) {
1566       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1567         return false;
1568 
1569       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1570         return false;
1571 
1572       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1573 
1574       const int64_t Imm = DefMI.getOperand(1).getImm();
1575 
1576       // FIXME: This would be a lot easier if we could return a new instruction
1577       // instead of having to modify in place.
1578 
1579       // Remove these first since they are at the end.
1580       UseMI.RemoveOperand(
1581           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1582       UseMI.RemoveOperand(
1583           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1584 
1585       unsigned Src1Reg = Src1->getReg();
1586       unsigned Src1SubReg = Src1->getSubReg();
1587       Src0->setReg(Src1Reg);
1588       Src0->setSubReg(Src1SubReg);
1589       Src0->setIsKill(Src1->isKill());
1590 
1591       if (Opc == AMDGPU::V_MAC_F32_e64 ||
1592           Opc == AMDGPU::V_MAC_F16_e64)
1593         UseMI.untieRegOperand(
1594             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1595 
1596       Src1->ChangeToImmediate(Imm);
1597 
1598       removeModOperands(UseMI);
1599       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
1600 
1601       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1602       if (DeleteDef)
1603         DefMI.eraseFromParent();
1604 
1605       return true;
1606     }
1607 
1608     // Added part is the constant: Use v_madak_{f16, f32}.
1609     if (Src2->isReg() && Src2->getReg() == Reg) {
1610       // Not allowed to use constant bus for another operand.
1611       // We can however allow an inline immediate as src0.
1612       if (!Src0->isImm() &&
1613           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
1614         return false;
1615 
1616       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1617         return false;
1618 
1619       const int64_t Imm = DefMI.getOperand(1).getImm();
1620 
1621       // FIXME: This would be a lot easier if we could return a new instruction
1622       // instead of having to modify in place.
1623 
1624       // Remove these first since they are at the end.
1625       UseMI.RemoveOperand(
1626           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1627       UseMI.RemoveOperand(
1628           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1629 
1630       if (Opc == AMDGPU::V_MAC_F32_e64 ||
1631           Opc == AMDGPU::V_MAC_F16_e64)
1632         UseMI.untieRegOperand(
1633             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1634 
1635       // ChangeToImmediate adds Src2 back to the instruction.
1636       Src2->ChangeToImmediate(Imm);
1637 
1638       // These come before src2.
1639       removeModOperands(UseMI);
1640       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
1641 
1642       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1643       if (DeleteDef)
1644         DefMI.eraseFromParent();
1645 
1646       return true;
1647     }
1648   }
1649 
1650   return false;
1651 }
1652 
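// Returns true if the byte ranges [OffsetA, OffsetA + WidthA) and
// [OffsetB, OffsetB + WidthB) do not overlap, i.e. the lower range ends at or
// before the higher range begins.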
1653 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
1654                                 int WidthB, int OffsetB) {
1655   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1656   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1657   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1658   return LowOffset + LowWidth <= HighOffset;
1659 }
1660 
1661 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
1662                                                MachineInstr &MIb) const {
1663   unsigned BaseReg0, BaseReg1;
1664   int64_t Offset0, Offset1;
1665 
1666   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
1667       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
1668 
1669     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
1670       // FIXME: Handle ds_read2 / ds_write2.
1671       return false;
1672     }
1673     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
1674     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
1675     if (BaseReg0 == BaseReg1 &&
1676         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
1677       return true;
1678     }
1679   }
1680 
1681   return false;
1682 }
1683 
1684 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
1685                                                   MachineInstr &MIb,
1686                                                   AliasAnalysis *AA) const {
1687   assert((MIa.mayLoad() || MIa.mayStore()) &&
1688          "MIa must load from or modify a memory location");
1689   assert((MIb.mayLoad() || MIb.mayStore()) &&
1690          "MIb must load from or modify a memory location");
1691 
1692   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
1693     return false;
1694 
1695   // XXX - Can we relax this between address spaces?
1696   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
1697     return false;
1698 
1699   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
1700     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
1701     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
1702     if (MMOa->getValue() && MMOb->getValue()) {
1703       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
1704       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
1705       if (!AA->alias(LocA, LocB))
1706         return true;
1707     }
1708   }
1709 
1710   // TODO: Should we check the address space from the MachineMemOperand? That
1711   // would allow us to distinguish objects we know don't alias based on the
1712   // underlying address space, even if it was lowered to a different one,
1713   // e.g. private accesses lowered to use MUBUF instructions on a scratch
1714   // buffer.
1715   if (isDS(MIa)) {
1716     if (isDS(MIb))
1717       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1718 
1719     return !isFLAT(MIb);
1720   }
1721 
1722   if (isMUBUF(MIa) || isMTBUF(MIa)) {
1723     if (isMUBUF(MIb) || isMTBUF(MIb))
1724       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1725 
1726     return !isFLAT(MIb) && !isSMRD(MIb);
1727   }
1728 
1729   if (isSMRD(MIa)) {
1730     if (isSMRD(MIb))
1731       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1732 
1733     return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
1734   }
1735 
1736   if (isFLAT(MIa)) {
1737     if (isFLAT(MIb))
1738       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1739 
1740     return false;
1741   }
1742 
1743   return false;
1744 }
1745 
1746 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
1747                                                  MachineInstr &MI,
1748                                                  LiveVariables *LV) const {
1749   bool IsF16 = false;
1750 
1751   switch (MI.getOpcode()) {
1752   default:
1753     return nullptr;
1754   case AMDGPU::V_MAC_F16_e64:
1755     IsF16 = true;
1756   case AMDGPU::V_MAC_F32_e64:
1757     break;
1758   case AMDGPU::V_MAC_F16_e32:
1759     IsF16 = true;
1760   case AMDGPU::V_MAC_F32_e32: {
1761     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1762                                              AMDGPU::OpName::src0);
1763     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
1764     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
1765       return nullptr;
1766     break;
1767   }
1768   }
1769 
1770   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
1771   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
1772   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
1773   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
1774 
1775   return BuildMI(*MBB, MI, MI.getDebugLoc(),
1776                  get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
1777       .add(*Dst)
1778       .addImm(0) // Src0 mods
1779       .add(*Src0)
1780       .addImm(0) // Src1 mods
1781       .add(*Src1)
1782       .addImm(0) // Src2 mods
1783       .add(*Src2)
1784       .addImm(0)  // clamp
1785       .addImm(0); // omod
1786 }
1787 
1788 // It's not generally safe to move VALU instructions across these since it will
1789 // start using the register as a base index rather than directly.
1790 // XXX - Why isn't hasSideEffects sufficient for these?
1791 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
1792   switch (MI.getOpcode()) {
1793   case AMDGPU::S_SET_GPR_IDX_ON:
1794   case AMDGPU::S_SET_GPR_IDX_MODE:
1795   case AMDGPU::S_SET_GPR_IDX_OFF:
1796     return true;
1797   default:
1798     return false;
1799   }
1800 }
1801 
1802 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
1803                                        const MachineBasicBlock *MBB,
1804                                        const MachineFunction &MF) const {
1805   // XXX - Do we want the SP check in the base implementation?
1806 
1807   // Target-independent instructions do not have an implicit-use of EXEC, even
1808   // when they operate on VGPRs. Treating EXEC modifications as scheduling
1809   // boundaries prevents incorrect movements of such instructions.
1810   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
1811          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
1812          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
1813          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
1814          changesVGPRIndexingMode(MI);
1815 }
1816 
1817 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
1818   switch (Imm.getBitWidth()) {
1819   case 32:
1820     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
1821                                         ST.hasInv2PiInlineImm());
1822   case 64:
1823     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
1824                                         ST.hasInv2PiInlineImm());
1825   case 16:
1826     return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
1827                                         ST.hasInv2PiInlineImm());
1828   default:
1829     llvm_unreachable("invalid bitwidth");
1830   }
1831 }
1832 
1833 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
1834                                    uint8_t OperandType) const {
1835   if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
1836     return false;
1837 
1838   // MachineOperand provides no way to tell the true operand size, since it only
1839   // records a 64-bit value. We need to know the size to determine if a 32-bit
1840   // floating point immediate bit pattern is legal for an integer immediate. It
1841   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
1842 
1843   int64_t Imm = MO.getImm();
1844   switch (operandBitWidth(OperandType)) {
1845   case 32: {
1846     int32_t Trunc = static_cast<int32_t>(Imm);
1847     return Trunc == Imm &&
1848            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
1849   }
1850   case 64: {
1851     return AMDGPU::isInlinableLiteral64(MO.getImm(),
1852                                         ST.hasInv2PiInlineImm());
1853   }
1854   case 16: {
1855     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
1856       int16_t Trunc = static_cast<int16_t>(Imm);
1857       return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
1858     }
1859 
1860     return false;
1861   }
1862   default:
1863     llvm_unreachable("invalid bitwidth");
1864   }
1865 }
1866 
1867 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
1868                                         const MCOperandInfo &OpInfo) const {
1869   switch (MO.getType()) {
1870   case MachineOperand::MO_Register:
1871     return false;
1872   case MachineOperand::MO_Immediate:
1873     return !isInlineConstant(MO, OpInfo);
1874   case MachineOperand::MO_FrameIndex:
1875   case MachineOperand::MO_MachineBasicBlock:
1876   case MachineOperand::MO_ExternalSymbol:
1877   case MachineOperand::MO_GlobalAddress:
1878   case MachineOperand::MO_MCSymbol:
1879     return true;
1880   default:
1881     llvm_unreachable("unexpected operand type");
1882   }
1883 }
1884 
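// Returns true if the two operands have the same type and refer to the same
// register or hold the same immediate value.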
1885 static bool compareMachineOp(const MachineOperand &Op0,
1886                              const MachineOperand &Op1) {
1887   if (Op0.getType() != Op1.getType())
1888     return false;
1889 
1890   switch (Op0.getType()) {
1891   case MachineOperand::MO_Register:
1892     return Op0.getReg() == Op1.getReg();
1893   case MachineOperand::MO_Immediate:
1894     return Op0.getImm() == Op1.getImm();
1895   default:
1896     llvm_unreachable("Didn't expect to be comparing these operand types");
1897   }
1898 }
1899 
1900 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
1901                                     const MachineOperand &MO) const {
1902   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
1903 
1904   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
1905 
1906   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
1907     return true;
1908 
1909   if (OpInfo.RegClass < 0)
1910     return false;
1911 
1912   if (MO.isImm() && isInlineConstant(MO, OpInfo))
1913     return RI.opCanUseInlineConstant(OpInfo.OperandType);
1914 
1915   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
1916 }
1917 
1918 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
1919   int Op32 = AMDGPU::getVOPe32(Opcode);
1920   if (Op32 == -1)
1921     return false;
1922 
1923   return pseudoToMCOpcode(Op32) != -1;
1924 }
1925 
1926 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
1927   // The src0_modifiers operand is present on all instructions
1928   // that have modifiers.
1929 
1930   return AMDGPU::getNamedOperandIdx(Opcode,
1931                                     AMDGPU::OpName::src0_modifiers) != -1;
1932 }
1933 
1934 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
1935                                   unsigned OpName) const {
1936   const MachineOperand *Mods = getNamedOperand(MI, OpName);
1937   return Mods && Mods->getImm();
1938 }
1939 
1940 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
1941                                   const MachineOperand &MO,
1942                                   const MCOperandInfo &OpInfo) const {
1943   // Literal constants use the constant bus.
1944   //if (isLiteralConstantLike(MO, OpInfo))
1945   // return true;
1946   if (MO.isImm())
1947     return !isInlineConstant(MO, OpInfo);
1948 
1949   if (!MO.isReg())
1950     return true; // Misc other operands like FrameIndex
1951 
1952   if (!MO.isUse())
1953     return false;
1954 
1955   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
1956     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
1957 
1958   // FLAT_SCR is just an SGPR pair.
1959   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
1960     return true;
1961 
1962   // EXEC register uses the constant bus.
1963   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
1964     return true;
1965 
1966   // SGPRs use the constant bus
1967   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
1968           (!MO.isImplicit() &&
1969            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
1970             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
1971 }
1972 
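// Returns the special SGPR (VCC, M0 or FLAT_SCR) implicitly read by \p MI, or
// AMDGPU::NoRegister if none of them is read. Such implicit reads count
// against the constant bus limit.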
1973 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
1974   for (const MachineOperand &MO : MI.implicit_operands()) {
1975     // We only care about reads.
1976     if (MO.isDef())
1977       continue;
1978 
1979     switch (MO.getReg()) {
1980     case AMDGPU::VCC:
1981     case AMDGPU::M0:
1982     case AMDGPU::FLAT_SCR:
1983       return MO.getReg();
1984 
1985     default:
1986       break;
1987     }
1988   }
1989 
1990   return AMDGPU::NoRegister;
1991 }
1992 
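// Returns true if \p MI is expected to have an implicit use of EXEC. VALU
// instructions read EXEC, except for the readlane/writelane family; generic
// opcodes, SALU and SMRD instructions do not.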
1993 static bool shouldReadExec(const MachineInstr &MI) {
1994   if (SIInstrInfo::isVALU(MI)) {
1995     switch (MI.getOpcode()) {
1996     case AMDGPU::V_READLANE_B32:
1997     case AMDGPU::V_READLANE_B32_si:
1998     case AMDGPU::V_READLANE_B32_vi:
1999     case AMDGPU::V_WRITELANE_B32:
2000     case AMDGPU::V_WRITELANE_B32_si:
2001     case AMDGPU::V_WRITELANE_B32_vi:
2002       return false;
2003     }
2004 
2005     return true;
2006   }
2007 
2008   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2009       SIInstrInfo::isSALU(MI) ||
2010       SIInstrInfo::isSMRD(MI))
2011     return false;
2012 
2013   return true;
2014 }
2015 
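// Returns true if \p SubReg accesses a sub-register of \p SuperVec: either a
// physical sub-register of the super-register, or the same virtual register
// used with a sub-register index.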
2016 static bool isSubRegOf(const SIRegisterInfo &TRI,
2017                        const MachineOperand &SuperVec,
2018                        const MachineOperand &SubReg) {
2019   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2020     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2021 
2022   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2023          SubReg.getReg() == SuperVec.getReg();
2024 }
2025 
2026 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2027                                     StringRef &ErrInfo) const {
2028   uint16_t Opcode = MI.getOpcode();
2029   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2030   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2031   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2032   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2033 
2034   // Make sure the number of operands is correct.
2035   const MCInstrDesc &Desc = get(Opcode);
2036   if (!Desc.isVariadic() &&
2037       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2038     ErrInfo = "Instruction has wrong number of operands.";
2039     return false;
2040   }
2041 
2042   if (MI.isInlineAsm()) {
2043     // Verify register classes for inlineasm constraints.
2044     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2045          I != E; ++I) {
2046       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2047       if (!RC)
2048         continue;
2049 
2050       const MachineOperand &Op = MI.getOperand(I);
2051       if (!Op.isReg())
2052         continue;
2053 
2054       unsigned Reg = Op.getReg();
2055       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2056         ErrInfo = "inlineasm operand has incorrect register class.";
2057         return false;
2058       }
2059     }
2060 
2061     return true;
2062   }
2063 
2064   // Make sure the register classes are correct.
2065   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2066     if (MI.getOperand(i).isFPImm()) {
2067       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2068                 "all fp values to integers.";
2069       return false;
2070     }
2071 
2072     int RegClass = Desc.OpInfo[i].RegClass;
2073 
2074     switch (Desc.OpInfo[i].OperandType) {
2075     case MCOI::OPERAND_REGISTER:
2076       if (MI.getOperand(i).isImm()) {
2077         ErrInfo = "Illegal immediate value for operand.";
2078         return false;
2079       }
2080       break;
2081     case AMDGPU::OPERAND_REG_IMM_INT32:
2082     case AMDGPU::OPERAND_REG_IMM_FP32:
2083       break;
2084     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2085     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2086     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2087     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2088     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2089     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2090       const MachineOperand &MO = MI.getOperand(i);
2091       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2092         ErrInfo = "Illegal immediate value for operand.";
2093         return false;
2094       }
2095       break;
2096     }
2097     case MCOI::OPERAND_IMMEDIATE:
2098     case AMDGPU::OPERAND_KIMM32:
2099       // Check if this operand is an immediate.
2100       // FrameIndex operands will be replaced by immediates, so they are
2101       // allowed.
2102       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2103         ErrInfo = "Expected immediate, but got non-immediate";
2104         return false;
2105       }
2106       LLVM_FALLTHROUGH;
2107     default:
2108       continue;
2109     }
2110 
2111     if (!MI.getOperand(i).isReg())
2112       continue;
2113 
2114     if (RegClass != -1) {
2115       unsigned Reg = MI.getOperand(i).getReg();
2116       if (Reg == AMDGPU::NoRegister ||
2117           TargetRegisterInfo::isVirtualRegister(Reg))
2118         continue;
2119 
2120       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2121       if (!RC->contains(Reg)) {
2122         ErrInfo = "Operand has incorrect register class.";
2123         return false;
2124       }
2125     }
2126   }
2127 
2128   // Verify VOP*
2129   if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
2130     // Only look at the true operands. Only a real operand can use the constant
2131     // bus, and we don't want to check pseudo-operands like the source modifier
2132     // flags.
2133     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2134 
2135     unsigned ConstantBusCount = 0;
2136 
2137     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2138       ++ConstantBusCount;
2139 
2140     unsigned SGPRUsed = findImplicitSGPRRead(MI);
2141     if (SGPRUsed != AMDGPU::NoRegister)
2142       ++ConstantBusCount;
2143 
2144     for (int OpIdx : OpIndices) {
2145       if (OpIdx == -1)
2146         break;
2147       const MachineOperand &MO = MI.getOperand(OpIdx);
2148       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2149         if (MO.isReg()) {
2150           if (MO.getReg() != SGPRUsed)
2151             ++ConstantBusCount;
2152           SGPRUsed = MO.getReg();
2153         } else {
2154           ++ConstantBusCount;
2155         }
2156       }
2157     }
2158     if (ConstantBusCount > 1) {
2159       ErrInfo = "VOP* instruction uses the constant bus more than once";
2160       return false;
2161     }
2162   }
2163 
2164   // Verify misc. restrictions on specific instructions.
2165   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2166       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2167     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2168     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2169     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2170     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2171       if (!compareMachineOp(Src0, Src1) &&
2172           !compareMachineOp(Src0, Src2)) {
2173         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2174         return false;
2175       }
2176     }
2177   }
2178 
2179   if (isSOPK(MI)) {
2180     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2181     if (sopkIsZext(MI)) {
2182       if (!isUInt<16>(Imm)) {
2183         ErrInfo = "invalid immediate for SOPK instruction";
2184         return false;
2185       }
2186     } else {
2187       if (!isInt<16>(Imm)) {
2188         ErrInfo = "invalid immediate for SOPK instruction";
2189         return false;
2190       }
2191     }
2192   }
2193 
2194   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2195       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2196       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2197       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
2198     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2199                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2200 
2201     const unsigned StaticNumOps = Desc.getNumOperands() +
2202       Desc.getNumImplicitUses();
2203     const unsigned NumImplicitOps = IsDst ? 2 : 1;
2204 
2205     // Allow additional implicit operands. This allows a fixup done by the post
2206     // RA scheduler where the main implicit operand is killed and implicit-defs
2207     // are added for sub-registers that remain live after this instruction.
2208     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
2209       ErrInfo = "missing implicit register operands";
2210       return false;
2211     }
2212 
2213     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2214     if (IsDst) {
2215       if (!Dst->isUse()) {
2216         ErrInfo = "v_movreld_b32 vdst should be a use operand";
2217         return false;
2218       }
2219 
2220       unsigned UseOpIdx;
2221       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2222           UseOpIdx != StaticNumOps + 1) {
2223         ErrInfo = "movrel implicit operands should be tied";
2224         return false;
2225       }
2226     }
2227 
2228     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2229     const MachineOperand &ImpUse
2230       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2231     if (!ImpUse.isReg() || !ImpUse.isUse() ||
2232         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
2233       ErrInfo = "src0 should be subreg of implicit vector use";
2234       return false;
2235     }
2236   }
2237 
2238   // Make sure we aren't losing exec uses in the td files. This mostly requires
2239   // being careful when using let Uses to try to add other use registers.
2240   if (shouldReadExec(MI)) {
2241     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
2242       ErrInfo = "VALU instruction does not implicitly read exec mask";
2243       return false;
2244     }
2245   }
2246 
2247   if (isSMRD(MI)) {
2248     if (MI.mayStore()) {
2249       // The register offset form of scalar stores may only use m0 as the
2250       // soffset register.
2251       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2252       if (Soff && Soff->getReg() != AMDGPU::M0) {
2253         ErrInfo = "scalar stores must use m0 as offset register";
2254         return false;
2255       }
2256     }
2257   }
2258 
2259   return true;
2260 }
2261 
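// Map a scalar (SALU) opcode to the VALU opcode to use when moving the
// instruction to the vector unit, or INSTRUCTION_LIST_END if there is no
// direct equivalent.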
2262 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
2263   switch (MI.getOpcode()) {
2264   default: return AMDGPU::INSTRUCTION_LIST_END;
2265   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2266   case AMDGPU::COPY: return AMDGPU::COPY;
2267   case AMDGPU::PHI: return AMDGPU::PHI;
2268   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2269   case AMDGPU::S_MOV_B32:
2270     return MI.getOperand(1).isReg() ?
2271            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
2272   case AMDGPU::S_ADD_I32:
2273   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
2274   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
2275   case AMDGPU::S_SUB_I32:
2276   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
2277   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
2278   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
2279   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
2280   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
2281   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
2282   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
2283   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
2284   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
2285   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
2286   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
2287   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
2288   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
2289   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
2290   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
2291   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
2292   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
2293   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
2294   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
2295   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
2296   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
2297   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
2298   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
2299   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
2300   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
2301   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
2302   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
2303   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
2304   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
2305   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
2306   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
2307   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
2308   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
2309   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
2310   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
2311   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
2312   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
2313   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
2314   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
2315   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
2316   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
2317   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
2318   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
2319   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
2320   }
2321 }
2322 
2323 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
2324   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
2325 }
2326 
2327 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
2328                                                       unsigned OpNo) const {
2329   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2330   const MCInstrDesc &Desc = get(MI.getOpcode());
2331   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
2332       Desc.OpInfo[OpNo].RegClass == -1) {
2333     unsigned Reg = MI.getOperand(OpNo).getReg();
2334 
2335     if (TargetRegisterInfo::isVirtualRegister(Reg))
2336       return MRI.getRegClass(Reg);
2337     return RI.getPhysRegClass(Reg);
2338   }
2339 
2340   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
2341   return RI.getRegClass(RCID);
2342 }
2343 
2344 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
2345   switch (MI.getOpcode()) {
2346   case AMDGPU::COPY:
2347   case AMDGPU::REG_SEQUENCE:
2348   case AMDGPU::PHI:
2349   case AMDGPU::INSERT_SUBREG:
2350     return RI.hasVGPRs(getOpRegClass(MI, 0));
2351   default:
2352     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
2353   }
2354 }
2355 
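// Legalize operand \p OpIdx by copying or materializing it into a newly
// created virtual register and rewriting the operand to use that register.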
2356 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
2357   MachineBasicBlock::iterator I = MI;
2358   MachineBasicBlock *MBB = MI.getParent();
2359   MachineOperand &MO = MI.getOperand(OpIdx);
2360   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2361   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
2362   const TargetRegisterClass *RC = RI.getRegClass(RCID);
2363   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
2364   if (MO.isReg())
2365     Opcode = AMDGPU::COPY;
2366   else if (RI.isSGPRClass(RC))
2367     Opcode = AMDGPU::S_MOV_B32;
2368 
2369   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
2370   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
2371     VRC = &AMDGPU::VReg_64RegClass;
2372   else
2373     VRC = &AMDGPU::VGPR_32RegClass;
2374 
2375   unsigned Reg = MRI.createVirtualRegister(VRC);
2376   DebugLoc DL = MBB->findDebugLoc(I);
2377   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
2378   MO.ChangeToRegister(Reg, false);
2379 }
2380 
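// Extract sub-register \p SubIdx of \p SuperReg into a fresh virtual register
// of class \p SubRC by inserting COPY instructions before \p MI, and return
// the new register.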
2381 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
2382                                          MachineRegisterInfo &MRI,
2383                                          MachineOperand &SuperReg,
2384                                          const TargetRegisterClass *SuperRC,
2385                                          unsigned SubIdx,
2386                                          const TargetRegisterClass *SubRC)
2387                                          const {
2388   MachineBasicBlock *MBB = MI->getParent();
2389   DebugLoc DL = MI->getDebugLoc();
2390   unsigned SubReg = MRI.createVirtualRegister(SubRC);
2391 
2392   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
2393     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2394       .addReg(SuperReg.getReg(), 0, SubIdx);
2395     return SubReg;
2396   }
2397 
2398   // Just in case the super register is itself a sub-register, copy it to a new
2399   // value so we don't need to worry about merging its subreg index with the
2400   // SubIdx passed to this function. The register coalescer should be able to
2401   // eliminate this extra copy.
2402   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
2403 
2404   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
2405     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
2406 
2407   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2408     .addReg(NewSuperReg, 0, SubIdx);
2409 
2410   return SubReg;
2411 }
2412 
2413 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
2414   MachineBasicBlock::iterator MII,
2415   MachineRegisterInfo &MRI,
2416   MachineOperand &Op,
2417   const TargetRegisterClass *SuperRC,
2418   unsigned SubIdx,
2419   const TargetRegisterClass *SubRC) const {
2420   if (Op.isImm()) {
2421     if (SubIdx == AMDGPU::sub0)
2422       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
2423     if (SubIdx == AMDGPU::sub1)
2424       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
2425 
2426     llvm_unreachable("Unhandled register index for immediate");
2427   }
2428 
2429   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
2430                                        SubIdx, SubRC);
2431   return MachineOperand::CreateReg(SubReg, false);
2432 }
2433 
2434 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
2435 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
2436   assert(Inst.getNumExplicitOperands() == 3);
2437   MachineOperand Op1 = Inst.getOperand(1);
2438   Inst.RemoveOperand(1);
2439   Inst.addOperand(Op1);
2440 }
2441 
2442 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
2443                                     const MCOperandInfo &OpInfo,
2444                                     const MachineOperand &MO) const {
2445   if (!MO.isReg())
2446     return false;
2447 
2448   unsigned Reg = MO.getReg();
2449   const TargetRegisterClass *RC =
2450     TargetRegisterInfo::isVirtualRegister(Reg) ?
2451     MRI.getRegClass(Reg) :
2452     RI.getPhysRegClass(Reg);
2453 
2454   const SIRegisterInfo *TRI =
2455       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
2456   RC = TRI->getSubRegClass(RC, MO.getSubReg());
2457 
2458   // In order to be legal, the common sub-class must be equal to the
2459   // class of the current operand.  For example:
2460   //
2461   // v_mov_b32 s0 ; Operand defined as vsrc_b32
2462   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
2463   //
2464   // s_sendmsg 0, s0 ; Operand defined as m0reg
2465   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
2466 
2467   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
2468 }
2469 
2470 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
2471                                      const MCOperandInfo &OpInfo,
2472                                      const MachineOperand &MO) const {
2473   if (MO.isReg())
2474     return isLegalRegOperand(MRI, OpInfo, MO);
2475 
2476   // Handle non-register types that are treated like immediates.
2477   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2478   return true;
2479 }
2480 
2481 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
2482                                  const MachineOperand *MO) const {
2483   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2484   const MCInstrDesc &InstDesc = MI.getDesc();
2485   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
2486   const TargetRegisterClass *DefinedRC =
2487       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
2488   if (!MO)
2489     MO = &MI.getOperand(OpIdx);
2490 
2491   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
2492 
2493     RegSubRegPair SGPRUsed;
2494     if (MO->isReg())
2495       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
2496 
2497     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2498       if (i == OpIdx)
2499         continue;
2500       const MachineOperand &Op = MI.getOperand(i);
2501       if (Op.isReg()) {
2502         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2503             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
2504           return false;
2505         }
2506       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
2507         return false;
2508       }
2509     }
2510   }
2511 
2512   if (MO->isReg()) {
2513     assert(DefinedRC);
2514     return isLegalRegOperand(MRI, OpInfo, *MO);
2515   }
2516 
2517   // Handle non-register types that are treated like immediates.
2518   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
2519 
2520   if (!DefinedRC) {
2521     // This operand expects an immediate.
2522     return true;
2523   }
2524 
2525   return isImmOperandLegal(MI, OpIdx, *MO);
2526 }
2527 
2528 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
2529                                        MachineInstr &MI) const {
2530   unsigned Opc = MI.getOpcode();
2531   const MCInstrDesc &InstrDesc = get(Opc);
2532 
2533   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2534   MachineOperand &Src1 = MI.getOperand(Src1Idx);
2535 
2536   // If there is an implicit SGPR use such as the VCC use for
2537   // v_addc_u32/v_subb_u32, we need to only have one constant bus use.
2538   //
2539   // Note we do not need to worry about literal constants here. They are
2540   // disabled for these operand types because they would always violate the
2541   // one constant bus use rule.
2542   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
2543   if (HasImplicitSGPR) {
2544     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2545     MachineOperand &Src0 = MI.getOperand(Src0Idx);
2546 
2547     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
2548       legalizeOpWithMove(MI, Src0Idx);
2549   }
2550 
2551   // Any operand type is legal for VOP2 src0, so we don't need to check it.
2552   // If src1 is already legal, we don't need to do anything.
2553   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
2554     return;
2555 
2556   // We do not use commuteInstruction here because it is too aggressive and will
2557   // commute if it is possible. We only want to commute here if it improves
2558   // legality. This can be called a fairly large number of times so don't waste
2559   // compile time pointlessly swapping and checking legality again.
2560   if (HasImplicitSGPR || !MI.isCommutable()) {
2561     legalizeOpWithMove(MI, Src1Idx);
2562     return;
2563   }
2564 
2565   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2566   MachineOperand &Src0 = MI.getOperand(Src0Idx);
2567 
2568   // If src0 can be used as src1, commuting will make the operands legal.
2569   // Otherwise we have to give up and insert a move.
2570   //
2571   // TODO: Other immediate-like operand kinds could be commuted if there was a
2572   // MachineOperand::ChangeTo* for them.
2573   if ((!Src1.isImm() && !Src1.isReg()) ||
2574       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
2575     legalizeOpWithMove(MI, Src1Idx);
2576     return;
2577   }
2578 
2579   int CommutedOpc = commuteOpcode(MI);
2580   if (CommutedOpc == -1) {
2581     legalizeOpWithMove(MI, Src1Idx);
2582     return;
2583   }
2584 
2585   MI.setDesc(get(CommutedOpc));
2586 
2587   unsigned Src0Reg = Src0.getReg();
2588   unsigned Src0SubReg = Src0.getSubReg();
2589   bool Src0Kill = Src0.isKill();
2590 
2591   if (Src1.isImm())
2592     Src0.ChangeToImmediate(Src1.getImm());
2593   else if (Src1.isReg()) {
2594     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
2595     Src0.setSubReg(Src1.getSubReg());
2596   } else
2597     llvm_unreachable("Should only have register or immediate operands");
2598 
2599   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
2600   Src1.setSubReg(Src0SubReg);
2601 }
2602 
2603 // Legalize VOP3 operands. Because all operand types are supported for any
2604 // operand, and since literal constants are not allowed and should never be
2605 // seen, we only need to worry about inserting copies if we use multiple SGPR
2606 // operands.
2607 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
2608                                        MachineInstr &MI) const {
2609   unsigned Opc = MI.getOpcode();
2610 
2611   int VOP3Idx[3] = {
2612     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
2613     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
2614     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
2615   };
2616 
2617   // Find the one SGPR operand we are allowed to use.
2618   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
2619 
2620   for (unsigned i = 0; i < 3; ++i) {
2621     int Idx = VOP3Idx[i];
2622     if (Idx == -1)
2623       break;
2624     MachineOperand &MO = MI.getOperand(Idx);
2625 
2626     // We should never see a VOP3 instruction with an illegal immediate operand.
2627     if (!MO.isReg())
2628       continue;
2629 
2630     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
2631       continue; // VGPRs are legal
2632 
2633     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
2634       SGPRReg = MO.getReg();
2635       // We can use one SGPR in each VOP3 instruction.
2636       continue;
2637     }
2638 
2639     // If we make it this far, then the operand is not legal and we must
2640     // legalize it.
2641     legalizeOpWithMove(MI, Idx);
2642   }
2643 }
2644 
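// Copy the value of the VGPR \p SrcReg into a new SGPR of the equivalent
// scalar class by reading each 32-bit component with v_readfirstlane_b32 and
// reassembling the pieces with a REG_SEQUENCE. Returns the resulting SGPR.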
2645 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
2646                                          MachineRegisterInfo &MRI) const {
2647   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
2648   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
2649   unsigned DstReg = MRI.createVirtualRegister(SRC);
2650   unsigned SubRegs = VRC->getSize() / 4;
2651 
2652   SmallVector<unsigned, 8> SRegs;
2653   for (unsigned i = 0; i < SubRegs; ++i) {
2654     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2655     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2656             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2657         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2658     SRegs.push_back(SGPR);
2659   }
2660 
2661   MachineInstrBuilder MIB =
2662       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
2663               get(AMDGPU::REG_SEQUENCE), DstReg);
2664   for (unsigned i = 0; i < SubRegs; ++i) {
2665     MIB.addReg(SRegs[i]);
2666     MIB.addImm(RI.getSubRegFromChannel(i));
2667   }
2668   return DstReg;
2669 }
2670 
2671 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2672                                        MachineInstr &MI) const {
2673 
2674   // If the pointer is stored in VGPRs, then we need to move it to
2675   // SGPRs using v_readfirstlane. This is safe because we only select
2676   // loads with uniform pointers to SMRD instructions, so we know the
2677   // pointer value is uniform.
2678   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
2679   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2680       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2681       SBase->setReg(SGPR);
2682   }
2683 }
2684 
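// If \p Op is not already in register class \p DstRC, insert a COPY into a
// new virtual register of \p DstRC at \p I and rewrite the operand to use it.
// If the copied value comes from a move-immediate, try to fold the immediate
// into the new copy.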
2685 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
2686                                          MachineBasicBlock::iterator I,
2687                                          const TargetRegisterClass *DstRC,
2688                                          MachineOperand &Op,
2689                                          MachineRegisterInfo &MRI,
2690                                          const DebugLoc &DL) const {
2691 
2692   unsigned OpReg = Op.getReg();
2693   unsigned OpSubReg = Op.getSubReg();
2694 
2695   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
2696       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
2697 
2698   // Check if operand is already the correct register class.
2699   if (DstRC == OpRC)
2700     return;
2701 
2702   unsigned DstReg = MRI.createVirtualRegister(DstRC);
2703   MachineInstr *Copy =
2704       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
2705 
2706   Op.setReg(DstReg);
2707   Op.setSubReg(0);
2708 
2709   MachineInstr *Def = MRI.getVRegDef(OpReg);
2710   if (!Def)
2711     return;
2712 
2713   // Try to eliminate the copy if it is copying an immediate value.
2714   if (Def->isMoveImmediate())
2715     FoldImmediate(*Copy, *Def, OpReg, &MRI);
2716 }
2717 
2718 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
2719   MachineFunction &MF = *MI.getParent()->getParent();
2720   MachineRegisterInfo &MRI = MF.getRegInfo();
2721 
2722   // Legalize VOP2
2723   if (isVOP2(MI) || isVOPC(MI)) {
2724     legalizeOperandsVOP2(MRI, MI);
2725     return;
2726   }
2727 
2728   // Legalize VOP3
2729   if (isVOP3(MI)) {
2730     legalizeOperandsVOP3(MRI, MI);
2731     return;
2732   }
2733 
2734   // Legalize SMRD
2735   if (isSMRD(MI)) {
2736     legalizeOperandsSMRD(MRI, MI);
2737     return;
2738   }
2739 
2740   // Legalize REG_SEQUENCE and PHI
2741   // The register class of the operands must match the register class of
2742   // the output.
2743   if (MI.getOpcode() == AMDGPU::PHI) {
2744     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2745     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
2746       if (!MI.getOperand(i).isReg() ||
2747           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
2748         continue;
2749       const TargetRegisterClass *OpRC =
2750           MRI.getRegClass(MI.getOperand(i).getReg());
2751       if (RI.hasVGPRs(OpRC)) {
2752         VRC = OpRC;
2753       } else {
2754         SRC = OpRC;
2755       }
2756     }
2757 
2758     // If any of the operands are VGPR registers, then they all must be;
2759     // otherwise we will create illegal VGPR->SGPR copies when legalizing
2760     // them.
2761     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
2762       if (!VRC) {
2763         assert(SRC);
2764         VRC = RI.getEquivalentVGPRClass(SRC);
2765       }
2766       RC = VRC;
2767     } else {
2768       RC = SRC;
2769     }
2770 
2771     // Update all the operands so they have the same type.
2772     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2773       MachineOperand &Op = MI.getOperand(I);
2774       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2775         continue;
2776 
2777       // MI is a PHI instruction.
2778       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
2779       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2780 
2781       // Avoid creating no-op copies with the same src and dst reg class.  These
2782       // confuse some of the machine passes.
2783       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
2784     }
2785   }
2786 
2787   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2788   // VGPR dest type and SGPR sources, insert copies so all operands are
2789   // VGPRs. This seems to help operand folding / the register coalescer.
2790   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
2791     MachineBasicBlock *MBB = MI.getParent();
2792     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
2793     if (RI.hasVGPRs(DstRC)) {
2794       // Update all the operands so they are VGPR register classes. These may
2795       // not be the same register class because REG_SEQUENCE supports mixing
2796       // subregister index types e.g. sub0_sub1 + sub2 + sub3
2797       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2798         MachineOperand &Op = MI.getOperand(I);
2799         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2800           continue;
2801 
2802         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2803         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2804         if (VRC == OpRC)
2805           continue;
2806 
2807         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
2808         Op.setIsKill();
2809       }
2810     }
2811 
2812     return;
2813   }
2814 
2815   // Legalize INSERT_SUBREG
2816   // src0 must have the same register class as dst
2817   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
2818     unsigned Dst = MI.getOperand(0).getReg();
2819     unsigned Src0 = MI.getOperand(1).getReg();
2820     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2821     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2822     if (DstRC != Src0RC) {
2823       MachineBasicBlock *MBB = MI.getParent();
2824       MachineOperand &Op = MI.getOperand(1);
2825       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
2826     }
2827     return;
2828   }
2829 
2830   // Legalize MIMG and MUBUF/MTBUF for shaders.
2831   //
2832   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
2833   // scratch memory access. In both cases, the legalization never involves
2834   // conversion to the addr64 form.
2835   if (isMIMG(MI) ||
2836       (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
2837        (isMUBUF(MI) || isMTBUF(MI)))) {
2838     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
2839     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2840       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2841       SRsrc->setReg(SGPR);
2842     }
2843 
2844     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
2845     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2846       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2847       SSamp->setReg(SGPR);
2848     }
2849     return;
2850   }
2851 
2852   // Legalize MUBUF* instructions by converting to addr64 form.
2853   // FIXME: If we start using the non-addr64 instructions for compute, we
2854   // may need to legalize them as above. This especially applies to the
2855   // buffer_load_format_* variants and variants with idxen (or bothen).
2856   int SRsrcIdx =
2857       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
2858   if (SRsrcIdx != -1) {
2859     // We have an MUBUF instruction
2860     MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
2861     unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
2862     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2863                                              RI.getRegClass(SRsrcRC))) {
2864       // The operands are legal.
2865       // FIXME: We may need to legalize operands besides srsrc.
2866       return;
2867     }
2868 
2869     MachineBasicBlock &MBB = *MI.getParent();
2870 
2871     // Extract the ptr from the resource descriptor.
2872     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2873       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2874 
2875     // Create an empty resource descriptor
2876     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2877     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2878     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2879     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2880     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2881 
2882     // Zero64 = 0
2883     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
2884         .addImm(0);
2885 
2886     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2887     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
2888         .addImm(RsrcDataFormat & 0xFFFFFFFF);
2889 
2890     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2891     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
2892         .addImm(RsrcDataFormat >> 32);
2893 
2894     // NewSRsrc = {Zero64, SRsrcFormat}
2895     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2896         .addReg(Zero64)
2897         .addImm(AMDGPU::sub0_sub1)
2898         .addReg(SRsrcFormatLo)
2899         .addImm(AMDGPU::sub2)
2900         .addReg(SRsrcFormatHi)
2901         .addImm(AMDGPU::sub3);
2902 
2903     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
2904     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2905     if (VAddr) {
2906       // This is already an ADDR64 instruction so we need to add the pointer
2907       // extracted from the resource descriptor to the current value of VAddr.
2908       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2909       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2910 
2911       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2912       DebugLoc DL = MI.getDebugLoc();
2913       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2914         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2915         .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2916 
2917       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2918       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2919         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2920         .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2921 
2922       // NewVaddr = {NewVaddrHi, NewVaddrLo}
2923       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2924           .addReg(NewVAddrLo)
2925           .addImm(AMDGPU::sub0)
2926           .addReg(NewVAddrHi)
2927           .addImm(AMDGPU::sub1);
2928     } else {
      // This is the _OFFSET variant, so we need to convert it to the
      // ADDR64 form.
2931       assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
2932              < SISubtarget::VOLCANIC_ISLANDS &&
2933              "FIXME: Need to emit flat atomics here");
2934 
2935       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
2936       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2937       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
2938       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
2939 
      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
2942       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
2943       MachineInstr *Addr64;
2944 
2945       if (!VDataIn) {
2946         // Regular buffer load / store.
2947         MachineInstrBuilder MIB =
2948             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
2949                 .add(*VData)
2950                 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2951                 // This will be replaced later
2952                 // with the new value of vaddr.
2953                 .add(*SRsrc)
2954                 .add(*SOffset)
2955                 .add(*Offset);
2956 
2957         // Atomics do not have this operand.
2958         if (const MachineOperand *GLC =
2959                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
2960           MIB.addImm(GLC->getImm());
2961         }
2962 
2963         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
2964 
2965         if (const MachineOperand *TFE =
2966                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
2967           MIB.addImm(TFE->getImm());
2968         }
2969 
2970         MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
2971         Addr64 = MIB;
2972       } else {
2973         // Atomics with return.
2974         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
2975                      .add(*VData)
2976                      .add(*VDataIn)
2977                      .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2978                      // This will be replaced later
2979                      // with the new value of vaddr.
2980                      .add(*SRsrc)
2981                      .add(*SOffset)
2982                      .add(*Offset)
2983                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
2984                      .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
2985       }
2986 
2987       MI.removeFromParent();
2988 
      // NewVaddr = {SRsrcPtr:sub1, SRsrcPtr:sub0}
2990       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
2991               NewVAddr)
2992           .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2993           .addImm(AMDGPU::sub0)
2994           .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2995           .addImm(AMDGPU::sub1);
2996 
2997       VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
2998       SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
2999     }
3000 
3001     // Update the instruction to use NewVaddr
3002     VAddr->setReg(NewVAddr);
3003     // Update the instruction to use NewSRsrc
3004     SRsrc->setReg(NewSRsrc);
3005   }
3006 }
3007 
3008 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
3009   SmallVector<MachineInstr *, 128> Worklist;
3010   Worklist.push_back(&TopInst);
3011 
3012   while (!Worklist.empty()) {
3013     MachineInstr &Inst = *Worklist.pop_back_val();
3014     MachineBasicBlock *MBB = Inst.getParent();
3015     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3016 
3017     unsigned Opcode = Inst.getOpcode();
3018     unsigned NewOpcode = getVALUOp(Inst);
3019 
3020     // Handle some special cases
3021     switch (Opcode) {
3022     default:
3023       break;
3024     case AMDGPU::S_AND_B64:
3025       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3026       Inst.eraseFromParent();
3027       continue;
3028 
3029     case AMDGPU::S_OR_B64:
3030       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3031       Inst.eraseFromParent();
3032       continue;
3033 
3034     case AMDGPU::S_XOR_B64:
3035       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3036       Inst.eraseFromParent();
3037       continue;
3038 
3039     case AMDGPU::S_NOT_B64:
3040       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3041       Inst.eraseFromParent();
3042       continue;
3043 
3044     case AMDGPU::S_BCNT1_I32_B64:
3045       splitScalar64BitBCNT(Worklist, Inst);
3046       Inst.eraseFromParent();
3047       continue;
3048 
3049     case AMDGPU::S_BFE_I64: {
3050       splitScalar64BitBFE(Worklist, Inst);
3051       Inst.eraseFromParent();
3052       continue;
3053     }
3054 
3055     case AMDGPU::S_LSHL_B32:
3056       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3057         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3058         swapOperands(Inst);
3059       }
3060       break;
3061     case AMDGPU::S_ASHR_I32:
3062       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3063         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3064         swapOperands(Inst);
3065       }
3066       break;
3067     case AMDGPU::S_LSHR_B32:
3068       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3069         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3070         swapOperands(Inst);
3071       }
3072       break;
3073     case AMDGPU::S_LSHL_B64:
3074       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3075         NewOpcode = AMDGPU::V_LSHLREV_B64;
3076         swapOperands(Inst);
3077       }
3078       break;
3079     case AMDGPU::S_ASHR_I64:
3080       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3081         NewOpcode = AMDGPU::V_ASHRREV_I64;
3082         swapOperands(Inst);
3083       }
3084       break;
3085     case AMDGPU::S_LSHR_B64:
3086       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3087         NewOpcode = AMDGPU::V_LSHRREV_B64;
3088         swapOperands(Inst);
3089       }
3090       break;
3091 
3092     case AMDGPU::S_ABS_I32:
3093       lowerScalarAbs(Worklist, Inst);
3094       Inst.eraseFromParent();
3095       continue;
3096 
3097     case AMDGPU::S_CBRANCH_SCC0:
3098     case AMDGPU::S_CBRANCH_SCC1:
3099       // Clear unused bits of vcc
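      // The branch condition now lives in VCC rather than SCC, and the VCC
      // test considers all 64 bits, so mask out lanes that are inactive in
      // EXEC.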
3100       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3101               AMDGPU::VCC)
3102           .addReg(AMDGPU::EXEC)
3103           .addReg(AMDGPU::VCC);
3104       break;
3105 
3106     case AMDGPU::S_BFE_U64:
3107     case AMDGPU::S_BFM_B64:
3108       llvm_unreachable("Moving this op to VALU not implemented");
3109     }
3110 
3111     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3112       // We cannot move this instruction to the VALU, so we should try to
3113       // legalize its operands instead.
3114       legalizeOperands(Inst);
3115       continue;
3116     }
3117 
3118     // Use the new VALU Opcode.
3119     const MCInstrDesc &NewDesc = get(NewOpcode);
3120     Inst.setDesc(NewDesc);
3121 
    // Remove any references to SCC. Vector instructions can't read from it, and
    // we are about to add the implicit use / defs of VCC anyway, so we don't
    // want both.
3125     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3126       MachineOperand &Op = Inst.getOperand(i);
3127       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3128         Inst.RemoveOperand(i);
3129         addSCCDefUsersToVALUWorklist(Inst, Worklist);
3130       }
3131     }
3132 
3133     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
3134       // We are converting these to a BFE, so we need to add the missing
3135       // operands for the size and offset.
3136       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
3137       Inst.addOperand(MachineOperand::CreateImm(0));
3138       Inst.addOperand(MachineOperand::CreateImm(Size));
3139 
3140     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
3141       // The VALU version adds the second operand to the result, so insert an
3142       // extra 0 operand.
3143       Inst.addOperand(MachineOperand::CreateImm(0));
3144     }
3145 
3146     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
3147 
3148     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
3149       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3150       // If we need to move this to VGPRs, we need to unpack the second operand
3151       // back into the 2 separate ones for bit offset and width.
3152       assert(OffsetWidthOp.isImm() &&
3153              "Scalar BFE is only implemented for constant width and offset");
3154       uint32_t Imm = OffsetWidthOp.getImm();
3155 
3156       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3157       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
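      // e.g. an immediate of 0x100010 encodes offset = 16 and width = 16.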
3158       Inst.RemoveOperand(2);                     // Remove old immediate.
3159       Inst.addOperand(MachineOperand::CreateImm(Offset));
3160       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3161     }
3162 
3163     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3164     unsigned NewDstReg = AMDGPU::NoRegister;
3165     if (HasDst) {
3166       // Update the destination register class.
3167       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3168       if (!NewDstRC)
3169         continue;
3170 
3171       unsigned DstReg = Inst.getOperand(0).getReg();
3172       if (Inst.isCopy() &&
3173           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
3174           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src.  These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge, since that pass assumes
        // copies will end up as machine instructions and not be eliminated.
3181         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3182         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3183         MRI.clearKillFlags(Inst.getOperand(1).getReg());
3184         Inst.getOperand(0).setReg(DstReg);
3185         continue;
3186       }
3187 
3188       NewDstReg = MRI.createVirtualRegister(NewDstRC);
3189       MRI.replaceRegWith(DstReg, NewDstReg);
3190     }
3191 
3192     // Legalize the operands
3193     legalizeOperands(Inst);
3194 
3195     if (HasDst)
      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
3197   }
3198 }
3199 
3200 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
3201                                  MachineInstr &Inst) const {
3202   MachineBasicBlock &MBB = *Inst.getParent();
3203   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3204   MachineBasicBlock::iterator MII = Inst;
3205   DebugLoc DL = Inst.getDebugLoc();
3206 
3207   MachineOperand &Dest = Inst.getOperand(0);
3208   MachineOperand &Src = Inst.getOperand(1);
3209   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3210   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3211 
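  // Lower |x| as max(x, 0 - x): TmpReg holds 0 - Src and ResultReg takes
  // the max of the two.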
3212   BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
3213     .addImm(0)
3214     .addReg(Src.getReg());
3215 
3216   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
3217     .addReg(Src.getReg())
3218     .addReg(TmpReg);
3219 
3220   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3221   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3222 }
3223 
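// Split a 64-bit scalar unary operation into two 32-bit VALU operations on the
// low and high halves, then recombine the results with a REG_SEQUENCE.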
3224 void SIInstrInfo::splitScalar64BitUnaryOp(
3225     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3226     unsigned Opcode) const {
3227   MachineBasicBlock &MBB = *Inst.getParent();
3228   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3229 
3230   MachineOperand &Dest = Inst.getOperand(0);
3231   MachineOperand &Src0 = Inst.getOperand(1);
3232   DebugLoc DL = Inst.getDebugLoc();
3233 
3234   MachineBasicBlock::iterator MII = Inst;
3235 
3236   const MCInstrDesc &InstDesc = get(Opcode);
3237   const TargetRegisterClass *Src0RC = Src0.isReg() ?
3238     MRI.getRegClass(Src0.getReg()) :
3239     &AMDGPU::SGPR_32RegClass;
3240 
3241   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3242 
3243   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3244                                                        AMDGPU::sub0, Src0SubRC);
3245 
3246   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3247   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3248   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3249 
3250   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3251   BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
3252 
3253   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3254                                                        AMDGPU::sub1, Src0SubRC);
3255 
3256   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3257   BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
3258 
3259   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3260   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3261     .addReg(DestSub0)
3262     .addImm(AMDGPU::sub0)
3263     .addReg(DestSub1)
3264     .addImm(AMDGPU::sub1);
3265 
3266   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3267 
3268   // We don't need to legalizeOperands here because for a single operand, src0
3269   // will support any kind of input.
3270 
3271   // Move all users of this moved value.
3272   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3273 }
3274 
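// Split a 64-bit scalar binary operation into two 32-bit VALU operations, one
// on the sub0 halves and one on the sub1 halves of the sources, and recombine
// the results with a REG_SEQUENCE.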
3275 void SIInstrInfo::splitScalar64BitBinaryOp(
3276     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst,
3277     unsigned Opcode) const {
3278   MachineBasicBlock &MBB = *Inst.getParent();
3279   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3280 
3281   MachineOperand &Dest = Inst.getOperand(0);
3282   MachineOperand &Src0 = Inst.getOperand(1);
3283   MachineOperand &Src1 = Inst.getOperand(2);
3284   DebugLoc DL = Inst.getDebugLoc();
3285 
3286   MachineBasicBlock::iterator MII = Inst;
3287 
3288   const MCInstrDesc &InstDesc = get(Opcode);
3289   const TargetRegisterClass *Src0RC = Src0.isReg() ?
3290     MRI.getRegClass(Src0.getReg()) :
3291     &AMDGPU::SGPR_32RegClass;
3292 
3293   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3294   const TargetRegisterClass *Src1RC = Src1.isReg() ?
3295     MRI.getRegClass(Src1.getReg()) :
3296     &AMDGPU::SGPR_32RegClass;
3297 
3298   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3299 
3300   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3301                                                        AMDGPU::sub0, Src0SubRC);
3302   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3303                                                        AMDGPU::sub0, Src1SubRC);
3304 
3305   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3306   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3307   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3308 
3309   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3310   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3311                               .add(SrcReg0Sub0)
3312                               .add(SrcReg1Sub0);
3313 
3314   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3315                                                        AMDGPU::sub1, Src0SubRC);
3316   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3317                                                        AMDGPU::sub1, Src1SubRC);
3318 
3319   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3320   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3321                               .add(SrcReg0Sub1)
3322                               .add(SrcReg1Sub1);
3323 
3324   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3325   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3326     .addReg(DestSub0)
3327     .addImm(AMDGPU::sub0)
3328     .addReg(DestSub1)
3329     .addImm(AMDGPU::sub1);
3330 
3331   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3332 
3333   // Try to legalize the operands in case we need to swap the order to keep it
3334   // valid.
3335   legalizeOperands(LoHalf);
3336   legalizeOperands(HiHalf);
3337 
  // Move all users of this moved value.
3339   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3340 }
3341 
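// Expand a 64-bit S_BCNT1 into two V_BCNT_U32_B32 instructions; the second one
// accumulates onto the result of the first, so ResultReg holds the population
// count of the full 64-bit source.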
3342 void SIInstrInfo::splitScalar64BitBCNT(
3343     SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const {
3344   MachineBasicBlock &MBB = *Inst.getParent();
3345   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3346 
3347   MachineBasicBlock::iterator MII = Inst;
3348   DebugLoc DL = Inst.getDebugLoc();
3349 
3350   MachineOperand &Dest = Inst.getOperand(0);
3351   MachineOperand &Src = Inst.getOperand(1);
3352 
3353   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3354   const TargetRegisterClass *SrcRC = Src.isReg() ?
3355     MRI.getRegClass(Src.getReg()) :
3356     &AMDGPU::SGPR_32RegClass;
3357 
3358   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3359   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3360 
3361   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3362 
3363   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3364                                                       AMDGPU::sub0, SrcSubRC);
3365   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3366                                                       AMDGPU::sub1, SrcSubRC);
3367 
3368   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
3369 
3370   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
3371 
3372   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3373 
  // We don't need to legalize operands here. src0 for either instruction can be
3375   // an SGPR, and the second input is unused or determined here.
3376   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3377 }
3378 
3379 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
3380                                       MachineInstr &Inst) const {
3381   MachineBasicBlock &MBB = *Inst.getParent();
3382   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3383   MachineBasicBlock::iterator MII = Inst;
3384   DebugLoc DL = Inst.getDebugLoc();
3385 
3386   MachineOperand &Dest = Inst.getOperand(0);
3387   uint32_t Imm = Inst.getOperand(2).getImm();
3388   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3389   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3390 
3391   (void) Offset;
3392 
3393   // Only sext_inreg cases handled.
3394   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
3395          Offset == 0 && "Not implemented");
3396 
3397   if (BitWidth < 32) {
3398     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3399     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3400     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3401 
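    // Sign-extend the low BitWidth bits into the low half with V_BFE_I32, then
    // replicate its sign bit into the high half with an arithmetic shift right
    // by 31.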
3402     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
3403         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
3404         .addImm(0)
3405         .addImm(BitWidth);
3406 
3407     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
3408       .addImm(31)
3409       .addReg(MidRegLo);
3410 
3411     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3412       .addReg(MidRegLo)
3413       .addImm(AMDGPU::sub0)
3414       .addReg(MidRegHi)
3415       .addImm(AMDGPU::sub1);
3416 
3417     MRI.replaceRegWith(Dest.getReg(), ResultReg);
3418     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3419     return;
3420   }
3421 
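  // BitWidth == 32: the low half of the source is already the extended value,
  // so only the high half needs to be filled with the sign bit.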
3422   MachineOperand &Src = Inst.getOperand(1);
3423   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3424   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3425 
3426   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
3427     .addImm(31)
3428     .addReg(Src.getReg(), 0, AMDGPU::sub0);
3429 
3430   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3431     .addReg(Src.getReg(), 0, AMDGPU::sub0)
3432     .addImm(AMDGPU::sub0)
3433     .addReg(TmpReg)
3434     .addImm(AMDGPU::sub1);
3435 
3436   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3437   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3438 }
3439 
3440 void SIInstrInfo::addUsersToMoveToVALUWorklist(
3441   unsigned DstReg,
3442   MachineRegisterInfo &MRI,
3443   SmallVectorImpl<MachineInstr *> &Worklist) const {
3444   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
3445          E = MRI.use_end(); I != E;) {
3446     MachineInstr &UseMI = *I->getParent();
3447     if (!canReadVGPR(UseMI, I.getOperandNo())) {
3448       Worklist.push_back(&UseMI);
3449 
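      // Skip the remaining uses that belong to the same instruction so it is
      // only pushed onto the worklist once.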
3450       do {
3451         ++I;
3452       } while (I != E && I->getParent() == &UseMI);
3453     } else {
3454       ++I;
3455     }
3456   }
3457 }
3458 
3459 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
3460     MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
3461   // This assumes that all the users of SCC are in the same block
3462   // as the SCC def.
3463   for (MachineInstr &MI :
3464        llvm::make_range(MachineBasicBlock::iterator(SCCDefInst),
3465                         SCCDefInst.getParent()->end())) {
3466     // Exit if we find another SCC def.
3467     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
3468       return;
3469 
3470     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
3471       Worklist.push_back(&MI);
3472   }
3473 }
3474 
3475 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
3476   const MachineInstr &Inst) const {
3477   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
3478 
3479   switch (Inst.getOpcode()) {
3480   // For target instructions, getOpRegClass just returns the virtual register
3481   // class associated with the operand, so we need to find an equivalent VGPR
3482   // register class in order to move the instruction to the VALU.
3483   case AMDGPU::COPY:
3484   case AMDGPU::PHI:
3485   case AMDGPU::REG_SEQUENCE:
3486   case AMDGPU::INSERT_SUBREG:
3487     if (RI.hasVGPRs(NewDstRC))
3488       return nullptr;
3489 
3490     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
3491     if (!NewDstRC)
3492       return nullptr;
3493     return NewDstRC;
3494   default:
3495     return NewDstRC;
3496   }
3497 }
3498 
3499 // Find the one SGPR operand we are allowed to use.
3500 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
3501                                    int OpIndices[3]) const {
3502   const MCInstrDesc &Desc = MI.getDesc();
3503 
3504   // Find the one SGPR operand we are allowed to use.
3505   //
3506   // First we need to consider the instruction's operand requirements before
3507   // legalizing. Some operands are required to be SGPRs, such as implicit uses
3508   // of VCC, but we are still bound by the constant bus requirement to only use
3509   // one.
3510   //
3511   // If the operand's class is an SGPR, we can never move it.
3512 
3513   unsigned SGPRReg = findImplicitSGPRRead(MI);
3514   if (SGPRReg != AMDGPU::NoRegister)
3515     return SGPRReg;
3516 
3517   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
3518   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3519 
3520   for (unsigned i = 0; i < 3; ++i) {
3521     int Idx = OpIndices[i];
3522     if (Idx == -1)
3523       break;
3524 
3525     const MachineOperand &MO = MI.getOperand(Idx);
3526     if (!MO.isReg())
3527       continue;
3528 
3529     // Is this operand statically required to be an SGPR based on the operand
3530     // constraints?
3531     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
3532     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
3533     if (IsRequiredSGPR)
3534       return MO.getReg();
3535 
    // If this could be a VGPR or an SGPR, check the dynamic register class.
3537     unsigned Reg = MO.getReg();
3538     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
3539     if (RI.isSGPRClass(RegRC))
3540       UsedSGPRs[i] = Reg;
3541   }
3542 
3543   // We don't have a required SGPR operand, so we have a bit more freedom in
3544   // selecting operands to move.
3545 
3546   // Try to select the most used SGPR. If an SGPR is equal to one of the
3547   // others, we choose that.
3548   //
3549   // e.g.
3550   // V_FMA_F32 v0, s0, s0, s0 -> No moves
3551   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
3552 
3553   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
3554   // prefer those.
3555 
3556   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
3557     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
3558       SGPRReg = UsedSGPRs[0];
3559   }
3560 
3561   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
3562     if (UsedSGPRs[1] == UsedSGPRs[2])
3563       SGPRReg = UsedSGPRs[1];
3564   }
3565 
3566   return SGPRReg;
3567 }
3568 
3569 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
3570                                              unsigned OperandName) const {
3571   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
3572   if (Idx == -1)
3573     return nullptr;
3574 
3575   return &MI.getOperand(Idx);
3576 }
3577 
3578 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
3579   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
3580   if (ST.isAmdHsaOS()) {
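    // Set ATC = 1.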
3581     RsrcDataFormat |= (1ULL << 56);
3582 
3583     if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3584       // Set MTYPE = 2
3585       RsrcDataFormat |= (2ULL << 59);
3586   }
3587 
3588   return RsrcDataFormat;
3589 }
3590 
3591 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
3592   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
3593                     AMDGPU::RSRC_TID_ENABLE |
3594                     0xffffffff; // Size;
3595 
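  // ELEMENT_SIZE is encoded as log2(bytes) - 1, e.g. a 4-byte element size is
  // encoded as 1 and a 16-byte element size as 3.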
3596   uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
3597 
3598   Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
3599             // IndexStride = 64
3600             (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
3601 
  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [17:14].
3603   // Clear them unless we want a huge stride.
3604   if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3605     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
3606 
3607   return Rsrc23;
3608 }
3609 
3610 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
3611   unsigned Opc = MI.getOpcode();
3612 
3613   return isSMRD(Opc);
3614 }
3615 
3616 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
3617   unsigned Opc = MI.getOpcode();
3618 
3619   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
3620 }
3621 
3622 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
3623                                     int &FrameIndex) const {
3624   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3625   if (!Addr || !Addr->isFI())
3626     return AMDGPU::NoRegister;
3627 
3628   assert(!MI.memoperands_empty() &&
3629          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
3630 
3631   FrameIndex = Addr->getIndex();
3632   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
3633 }
3634 
3635 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
3636                                         int &FrameIndex) const {
3637   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
3638   assert(Addr && Addr->isFI());
3639   FrameIndex = Addr->getIndex();
3640   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
3641 }
3642 
3643 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
3644                                           int &FrameIndex) const {
3645 
3646   if (!MI.mayLoad())
3647     return AMDGPU::NoRegister;
3648 
3649   if (isMUBUF(MI) || isVGPRSpill(MI))
3650     return isStackAccess(MI, FrameIndex);
3651 
3652   if (isSGPRSpill(MI))
3653     return isSGPRStackAccess(MI, FrameIndex);
3654 
3655   return AMDGPU::NoRegister;
3656 }
3657 
3658 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
3659                                          int &FrameIndex) const {
3660   if (!MI.mayStore())
3661     return AMDGPU::NoRegister;
3662 
3663   if (isMUBUF(MI) || isVGPRSpill(MI))
3664     return isStackAccess(MI, FrameIndex);
3665 
3666   if (isSGPRSpill(MI))
3667     return isSGPRStackAccess(MI, FrameIndex);
3668 
3669   return AMDGPU::NoRegister;
3670 }
3671 
3672 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
3673   unsigned Opc = MI.getOpcode();
3674   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
3675   unsigned DescSize = Desc.getSize();
3676 
3677   // If we have a definitive size, we can use it. Otherwise we need to inspect
3678   // the operands to know the size.
3679   //
3680   // FIXME: Instructions that have a base 32-bit encoding report their size as
3681   // 4, even though they are really 8 bytes if they have a literal operand.
3682   if (DescSize != 0 && DescSize != 4)
3683     return DescSize;
3684 
3685   if (Opc == AMDGPU::WAVE_BARRIER)
3686     return 0;
3687 
3688   // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
3690   if (isVALU(MI) || isSALU(MI)) {
3691     if (isFixedSize(MI)) {
3692       assert(DescSize == 4);
3693       return DescSize;
3694     }
3695 
3696     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3697     if (Src0Idx == -1)
3698       return 4; // No operands.
3699 
3700     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
3701       return 8;
3702 
3703     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3704     if (Src1Idx == -1)
3705       return 4;
3706 
3707     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
3708       return 8;
3709 
3710     return 4;
3711   }
3712 
3713   if (DescSize == 4)
3714     return 4;
3715 
3716   switch (Opc) {
3717   case AMDGPU::SI_MASK_BRANCH:
3718   case TargetOpcode::IMPLICIT_DEF:
3719   case TargetOpcode::KILL:
3720   case TargetOpcode::DBG_VALUE:
3721   case TargetOpcode::BUNDLE:
3722   case TargetOpcode::EH_LABEL:
3723     return 0;
3724   case TargetOpcode::INLINEASM: {
3725     const MachineFunction *MF = MI.getParent()->getParent();
3726     const char *AsmStr = MI.getOperand(0).getSymbolName();
3727     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
3728   }
3729   default:
3730     llvm_unreachable("unable to find instruction size");
3731   }
3732 }
3733 
3734 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
3735   if (!isFLAT(MI))
3736     return false;
3737 
3738   if (MI.memoperands_empty())
3739     return true;
3740 
3741   for (const MachineMemOperand *MMO : MI.memoperands()) {
3742     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
3743       return true;
3744   }
3745   return false;
3746 }
3747 
3748 ArrayRef<std::pair<int, const char *>>
3749 SIInstrInfo::getSerializableTargetIndices() const {
3750   static const std::pair<int, const char *> TargetIndices[] = {
3751       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
3752       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
3753       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
3754       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
3755       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
3756   return makeArrayRef(TargetIndices);
3757 }
3758 
/// This is used by the post-RA scheduler (PostRASchedulerList.cpp).  The
3760 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
3761 ScheduleHazardRecognizer *
3762 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
3763                                             const ScheduleDAG *DAG) const {
3764   return new GCNHazardRecognizer(DAG->MF);
3765 }
3766 
3767 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
3768 /// pass.
3769 ScheduleHazardRecognizer *
3770 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
3771   return new GCNHazardRecognizer(MF);
3772 }
3773 
3774 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
3775   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
3776          MI.modifiesRegister(AMDGPU::EXEC, &RI);
3777 }
3778